- Library und Data
library(tidyverse)
library(dplyr)
library(data.table)
library(ggplot2)
library(reshape2)
library(rsample)
library(recommenderlab)
data(MovieLense)
- Explorative Datenanalyse
# Convert the recommenderlab realRatingMatrix into plain base-R structures
# used throughout the exploratory analysis below.
mx_user_film <- as(MovieLense, "matrix") # convert realRatingMatrix to a normal matrix (unrated cells become NA)
df_user_film <- as.data.frame(mx_user_film) # users in rows, films in columns
df_film_user <- as.data.frame(t(mx_user_film)) # transposed view: each row is a film, each column a user
2.1 Welches sind die am häufigsten geschauten Genres/Filme?
# Most-watched film(s): count non-NA ratings per film (row) and keep the
# row(s) with the maximum count. The film name is carried in the rownames.
df_21 <- df_film_user %>% mutate(cnt = rowSums(!is.na(df_film_user))) %>% arrange(desc(cnt)) %>% filter(cnt == max(cnt)) %>% select('cnt')
df_21
Der am häufigsten geschaute Film ist Star Wars.
2.2 Wie verteilen sich die Kundenratings gesamthaft und nach Genres?
# Flatten the film x user dataframe into one long vector of ratings (NAs kept;
# geom_histogram drops them with the warning recorded below).
df_unlist <- data.frame(rating=unlist(df_film_user)) # unlist the dataframe into a single rating column
# Overall distribution of the user ratings.
ggplot(df_unlist,aes(rating)) + geom_histogram() + # distribution of all user ratings
labs(x="Ratings", y="Count",title="Distribution of the user ratings") +
theme(plot.title = element_text(hjust = 0.5))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 1469760 rows containing non-finite values (stat_bin).

The above histogram of ratings distribution is left skewed, with the mode = 4.
# Genre analysis: build a film x genre indicator matrix from the metadata,
# then aggregate ratings per genre via matrix multiplication.
mx_film_genre <- as.data.frame(MovieLenseMeta)
rownames(mx_film_genre) <- mx_film_genre$title
mx_film_genre <- as.matrix(mx_film_genre[,5:22]) # Movie Genre Matrix (columns 5:22 hold the 0/1 genre flags)
mx_user_film[is.na(mx_user_film)] <- 0 # NOTE: mutates mx_user_film in place; NA -> 0 so the product below sums only observed ratings
mx_user_genre <- mx_user_film %*% mx_film_genre # user x genre matrix of summed ratings
mx_genre_user <- as.data.frame(t(mx_user_genre)) # genre x user view of the summed ratings
mx_genre_user$summe <- rowSums(mx_genre_user) # new column "summe": summed user ratings of each genre
mx_genre_user <- cbind(genre = rownames(mx_genre_user), mx_genre_user)# new column "genre": genre name copied from rownames
ggplot(mx_genre_user,aes(summe,genre)) + geom_col() + labs(x= "summed ratings of all users", y="Genre",title="Distribution of the user ratings by genre combination") +
theme(plot.title = element_text(hjust = 0.5))

mx_genre_user <- mx_genre_user %>% select(-genre) # drop the helper column again
Above is the distribution of ratings by genre, “drama” has the highest summed ratings.
# Films per genre: rowSums of the transposed genre indicator matrix counts how
# many films carry each genre tag. This is a FILM count, not a view count, so
# the axis label below says "Number of films" (the original said "Views").
df_22 <- as.data.frame(t(mx_film_genre)) %>% mutate(cnt = rowSums(as.data.frame(t(mx_film_genre))))%>% arrange(desc(cnt)) # add new column: number of films per genre
df_22 <- cbind(genres = rownames(df_22),df_22) # move the genre names from rownames into a proper column
rownames(df_22) <- 1:nrow(df_22) # reset to a plain numeric index
ggplot(df_22,aes(x = (reorder(genres,cnt)), y = cnt)) + geom_col() + coord_flip() +
labs(y="Number of films", x="Genres",title="Distribution of films by genre") +
theme(plot.title = element_text(hjust = 0.5))

The plot above shows the number of films per genre (each film is counted once for every genre tag it carries, so this is a film count, not a view count). The genre drama contains the most films.
2.3 Wie verteilen sich die mittleren Kundenratings pro Film?
# Mean rating per film: row means over users, ignoring missing ratings.
df_avg_rating_film <- df_film_user %>% mutate(avg_rating = rowMeans(df_film_user,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
ggplot(df_avg_rating_film,aes(avg_rating)) + geom_histogram(binwidth = 1) + # distribution of the mean rating per film
labs(x="Mean user-ratings per film", y="Count of films",title="Distribution of the mean user-ratings per film") +
theme(plot.title = element_text(hjust = 0.5))

The plot showed us that the mode of average ratings per film is 3. Most of movies have the average rating larger than 2.
2.4 Wie stark streuen die Ratings von individuellen Kunden?
# Mean rating per user: row means over films, ignoring missing ratings.
df_avg_rating_user <- df_user_film %>% mutate(avg_rating = rowMeans(df_user_film,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
ggplot(df_avg_rating_user,aes(avg_rating)) + geom_histogram(bins = 5) + # distribution of the mean rating per user
labs(x="Mean user-ratings per user", y="Count of users",title="Distribution of the mean user-ratings per user")+
theme(plot.title = element_text(hjust = 0.5))

The plot shows the mode of average user ratings per user is 3. Most of the average ratings are 3 or 4.
2.5 Welchen Einfluss hat die Normierung der Ratings pro Kunde auf deren Verteilung?
# Same per-user mean as in 2.4, but with 20 bins so it can be compared
# directly against the normalized version below.
df_avg_rating_user <- df_user_film %>% mutate(avg_rating = rowMeans(df_user_film,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
ggplot(df_avg_rating_user,aes(avg_rating)) + geom_histogram(bins = 20) + # distribution without normalization
labs(x="Mean ratings per user", y="Count of users",title="Distribution of the mean ratings per user")+
theme(plot.title = element_text(hjust = 0.5))

# Z-score normalize each user's ratings (recommenderlab), then recompute the
# per-user means; after normalization every user's mean is ~0 by construction.
normalized_movielens <- as(normalize(MovieLense,method = "z-score"), "matrix")
normalized_movielens <- as.data.frame(normalized_movielens)
normalized_avg_rating_user <- normalized_movielens %>% mutate(avg_rating=rowMeans(normalized_movielens,na.rm=TRUE, dims=1)) %>% select('avg_rating')
ggplot(normalized_avg_rating_user,aes(avg_rating)) + geom_histogram(bins = 20) + # distribution after z-score normalization
labs(x="Mean normalized ratings per user", y="Count of users",title="Distribution of the mean normalized ratings per user") +
theme(plot.title = element_text(hjust = 0.5))

without normalization (first plot): the average ratings are slightly left skewed, with the mode of 3.8. The ratings vary a lot across different users.
with Z-score normalization (second plot): the average ratings per user are all around 0. This mean, the ratings from different users are normalized to the same scale, with the mean of 0, the standard deviation of 1. This will reduce the influence of rating habits of different users.
2.6 Welche strukturellen Charakteristika (z.B. Sparsity) und Auffälligkeiten zeigt die User-Item Matrix?
Recommender system data usually contains a large number of users (rows) and items (columns), but a single user interacts with only a small subset of the items. This means the user-item matrix consists mostly of missing (NA) values; its structure is extremely sparse.
Vor Datenreduktion: 1664 Movies, 943 Users, 93.67% der Daten sind NA (1469760 von 943 × 1664 = 1569152 Zellen).
print(dim(df_user_film))
[1] 943 1664
# NA share of the full user-item matrix. The denominator must be the total
# number of cells, 943 users x 1664 films (the original divided by 1663*942,
# off by one in both dimensions, which reported 0.9382 instead of 0.9367).
print(sum(is.na(df_user_film))/(943*1664))
[1] 0.9366588
image(as(df_user_film,"matrix"), main = "sparsity of dataframe before reduction")

Nach Datenreduktion: 700 Movies, 400 users, 75.90% data are NA.
# Same sparsity check after data reduction. NOTE(review): df_reduced (400
# users x 700 films) is created in a part of the file not shown here — confirm.
print(dim(df_reduced))
[1] 400 700
print(sum(is.na(df_reduced))/(700*400)) # denominator matches dim(df_reduced): 400 * 700 cells
[1] 0.7589929
image(as(df_reduced,"matrix"),main = "sparsity of dataframe after reduction")

The two images above showed us the data sparsity before (first image) and after (second image) reduction. The blank pixels represent the NA, the color pixels represent the available values. By comparing the two images, we could see that the first image has more blank and less color pixels than the second one. This means the data after reduction is less sparse than before reduction. Data reduction has successfully reduced the data sparsity.
3.3 mittlere Kundenratings pro Film vor und nach Datenreduktion.
Before data reduction
# Mean rating per film BEFORE reduction (finer bins than in 2.3).
df_avg_rating <- df_film_user %>% mutate(avg_rating = rowMeans(df_film_user,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
ggplot(df_avg_rating,aes(avg_rating)) + geom_histogram(binwidth = 0.25) + # distribution before reduction
labs(x="Mean ratings per film", y="Count",title="Before reduction: Distribution of the mean ratings by film") +
theme(plot.title = element_text(hjust = 0.5))

# Mean rating per film AFTER reduction (df_reduced is users x films, so
# transpose first to get films in rows).
df_reduced_t <- as.data.frame(t(df_reduced))
df_reduced_avg_rating <- df_reduced_t%>% mutate(avg_rating = rowMeans(df_reduced_t,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
ggplot(df_reduced_avg_rating,aes(avg_rating)) + geom_histogram(binwidth = 0.25) + # distribution after reduction
labs(x="Mean ratings per film", y="Count",title="After reduction: Distribution of the mean ratings by film") +
theme(plot.title = element_text(hjust = 0.5))

After the data reduction, the average ratings are close to a left skewed normal distribution.
4 Analyse Ähnlichkeitsmatrix
4.1 Zerlege den reduzierten MovieLense Datensatz in ein disjunkte Trainings-und Testdatenset im Verhältnis 4:1
# Split the reduced data 80/20 into train/test users. For each test user,
# given = 20 ratings are revealed ("known") and the rest are held back
# ("unknown"); ratings >= 4 count as relevant (goodRating).
set.seed(465)
mx_reduced <- as.matrix(df_reduced)
rrm_reduced <- as(mx_reduced,"realRatingMatrix")
train_test <- evaluationScheme(rrm_reduced, method="split", train=0.8, k=1, given=20, goodRating=4)
# training data: 80% of the users
rrm_reduced_train <- getData(train_test,"train")
# test data: the remaining 20% of users, split into two parts: known and unknown ratings
# the known portion holds the 20 given items per test user and is used to compute the predictions
rrm_reduced_known <- getData(train_test,"known")
# the unknown portion is used to compute the prediction error of the model
rrm_reduced_unknown <- getData(train_test,"unknown")
4.2 Trainiere ein IBCF Modell mit 30 Nachbarn und Cosine Similarity
# Item-based CF: z-score normalization, cosine similarity, k = 30 neighbours per item.
model_IBCF <- Recommender(data = rrm_reduced_train,method="IBCF",parameter=list(normalize = "Z-score",method="Cosine",k=30))
4.3 Bestimme die Verteilung der Filme, welche bei IBCF für paarweise Ähnlichkeitsvergleiche verwendet werden. Determine the distribution of films used in IBCF for pairwise similarity comparisons
# Visualise a corner of the learned item-item similarity matrix.
# Here only exhibit the first 50 rows and columns.
get_model_IBCF <- getModel(model_IBCF)
image(get_model_IBCF$sim[1:50, 1:50], main = "Similarity of the first 50 rows and columns")

The similarity matrix is not symmetric. Each row has 30 elements larger than 0. In each column, the number of elements greater than 0 indicates how many times this film was included in the TOP list of other films.
# For each film, count how often it appears with positive similarity in other
# films' neighbour lists (column sums over the sparse similarity matrix).
IBCF_sim <- as.data.frame(colSums(get_model_IBCF$sim > 0))
colnames(IBCF_sim) <- "recommended_frequency" # frequency that the corresponding film is included in other films' TOP-N lists
# Use the bare column name inside aes(); `IBCF_sim$...` triggers the ggplot2
# warning recorded below and bypasses ggplot's data masking.
ggplot(IBCF_sim, aes(x=recommended_frequency))+geom_histogram(fill="black", col="grey",binwidth = 5)+
labs(x = "Recommended frequency", y = "Count", title = "Distribution of recommended frequency per film") +
theme(plot.title = element_text(hjust = 0.5))
Warning: Use of `IBCF_sim$recommended_frequency` is discouraged. Use `recommended_frequency` instead.

The plot displays the distribution of films by how many times the corresponding film included in the TOP list of other films. For instance, about 52 films are not included in any TOP list of other films, about 100 films are included in the TOP lists of 5 films. The highest frequency is about 160.
4.4 Bestimme die Filme, die am häufigsten in der Cosine-Ähnlichkeitsmatrix auftauchen und analysiere deren Vorkommen und Ratings im reduzierten Datensatz.
die am häufigsten Film in der Cosine-Ähnlichkeitsmatrix
# The 10 most frequent films in the cosine similarity matrix, i.e. the films
# that appear most often in the top-k neighbour lists of other films.
# slice(1:10): the original used slice(0:10), which only worked because
# slice() silently ignores a 0 index.
high_freq_film <- IBCF_sim %>% mutate(film = rownames(IBCF_sim)) %>% arrange(desc(recommended_frequency)) %>% slice(1:10) %>% select(film,recommended_frequency)
high_freq_film
The Mouse Hunt is the most often recommended film.
die Vorkommen und Ratings in reduzierten Datensatz
# Per-film availability in the reduced data: df_reduced_t is film x user, so
# dim(df_reduced_t)[2] is the number of users and occurrence is the share of
# users who rated the film. The temporaries were renamed from `t`/`t2` —
# `t` shadowed base::t(), an accident waiting to happen.
df_occ <- df_reduced_t %>% mutate(is_NA = rowSums(is.na(df_reduced_t)),not_NA = rowSums(!is.na(df_reduced_t)),occurrence = rowSums(!is.na(df_reduced_t))/dim(df_reduced_t)[2], film = rownames(df_reduced_t)) %>% select(is_NA,not_NA,occurrence,film)
Occurrence <- left_join(high_freq_film,df_occ,by = "film") %>% select(film,recommended_frequency,occurrence)
# Mean rating per film, joined onto the same table.
df_avg <- df_reduced_t %>% mutate(film = rownames(df_reduced_t), avg_rating = rowMeans(df_reduced_t,na.rm=TRUE))%>% select(film,avg_rating)
Occurrence <- left_join(Occurrence,df_avg,by = "film")
Occurrence # occurrence: not_NA / number of users, i.e. the share of users who rated this film
recommended_frequency: the frequency with which this film appears in the top-k similarity lists of the other films.
occurrence: ratio that the film is rated to all users
avg_rating: the average rating of each film
From the result, we can see that there is no direct relationship between the three variables. The most frequently recommended film, Mouse Hunt, has a relatively low average rating of 2.32 and a medium occurrence.
5 Analyse Top-N Listen IBCF vs UBCF Vergleiche und diskutiere Top N Empfehlungen von IBCF und UBCF Modellen mit 30 Nachbarn und Cosine Similarity für den reduzierten Datensatz. Analysis Top-N lists IBCF vs UBCF. Compare and discuss top N recommendations from IBCF and UBCF models with 30 neighbors and cosine similarity for the reduced data set.
5.1 Berechne Top 15 Empfehlungen für Testkunden mit IBCF und UBCF
## top-N recommendations for the test users with IBCF
Pred_IBCF <- predict(object = model_IBCF, newdata = rrm_reduced_known, n = 15,type=c("topNList"))
# Map the recommended item indices back to film names. sapply yields a
# 15 x n_users matrix here; NOTE(review): this assumes every test user gets
# exactly 15 recommendations, otherwise the result would be a ragged list.
TOP15_IBCF <- sapply(Pred_IBCF@items, function(x) {colnames(df_reduced)[x]})
TOP15_IBCF[,1:2] # here only display the top 15 recommendations for the first two test users
655
[1,] "Titanic (1997)"
[2,] "Indiana Jones and the Last Crusade (1989)"
[3,] "Forrest Gump (1994)"
[4,] "Braveheart (1995)"
[5,] "Terminator 2: Judgment Day (1991)"
[6,] "Die Hard (1988)"
[7,] "It's a Wonderful Life (1946)"
[8,] "Hunt for Red October, The (1990)"
[9,] "Aladdin (1992)"
[10,] "Reservoir Dogs (1992)"
[11,] "Good, The Bad and The Ugly, The (1966)"
[12,] "Chasing Amy (1997)"
[13,] "Raging Bull (1980)"
[14,] "Close Shave, A (1995)"
[15,] "Chinatown (1974)"
276
[1,] "English Patient, The (1996)"
[2,] "Twelve Monkeys (1995)"
[3,] "Rock, The (1996)"
[4,] "Star Trek: First Contact (1996)"
[5,] "Fugitive, The (1993)"
[6,] "Forrest Gump (1994)"
[7,] "Birdcage, The (1996)"
[8,] "Jurassic Park (1993)"
[9,] "Dances with Wolves (1990)"
[10,] "Trainspotting (1996)"
[11,] "Fifth Element, The (1997)"
[12,] "Face/Off (1997)"
[13,] "Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)"
[14,] "This Is Spinal Tap (1984)"
[15,] "That Thing You Do! (1996)"
# predict with UBCF
# User-based CF with matching settings: z-score normalization, cosine
# similarity, 30 nearest neighbours.
model_UBCF <- Recommender(rrm_reduced_train,method="UBCF",param=list(normalize = "Z-score",method="Cosine",nn=30)) #model
# top-N recommendations for the test users with UBCF
Pred_UBCF <- predict(object = model_UBCF, newdata = rrm_reduced_known, n = 15,type=c("topNList"))
#TOP15_UBCF <- as(Pred_UBCF, "list")
TOP15_UBCF <- sapply(Pred_UBCF@items, function(x) {colnames(df_reduced)[x]})
TOP15_UBCF[,1:2] # here only display the top 15 recommendations for the first two test users
655 276
[1,] "Three Colors: Red (1994)" "Bridges of Madison County, The (1995)"
[2,] "Three Colors: White (1994)" "Dazed and Confused (1993)"
[3,] "Wings of Desire (1987)" "Big Lebowski, The (1998)"
[4,] "Antonia's Line (1995)" "Bronx Tale, A (1993)"
[5,] "Blue in the Face (1995)" "Fierce Creatures (1997)"
[6,] "M (1931)" "Postman, The (1997)"
[7,] "Winnie the Pooh and the Blustery Day (1968)" "Little Princess, A (1995)"
[8,] "As Good As It Gets (1997)" "Nixon (1995)"
[9,] "Wrong Trousers, The (1993)" "Strictly Ballroom (1992)"
[10,] "Sunset Blvd. (1950)" "Reservoir Dogs (1992)"
[11,] "Apartment, The (1960)" "Before Sunrise (1995)"
[12,] "East of Eden (1955)" "Secrets & Lies (1996)"
[13,] "Farewell My Concubine (1993)" "Red Corner (1997)"
[14,] "Postino, Il (1994)" "Crumb (1994)"
[15,] "Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922)" "Apt Pupil (1998)"
5.2 Vergleiche die Top 15 Empfehlungen und deren Verteilung und diskutiere Gemeinsamkeiten und Unterschiede zwischen IBCF und UBCF für alle Testkunden.
From above the result for first two users, we could see that the top 15 recommendations for the same user between the IBCF and UBCF models are completely different.
Compare the top 15 recommendations for all test users: first identify the most frequently recommended movies in the top-15 lists of all test users.
# generate frequency tables: how often each film appears in any test user's top-15 list
film_freq_IBCF <- as.data.frame(table(as.factor(TOP15_IBCF)))
colnames(film_freq_IBCF) <- c("Film_by_IBCF", "Frequency")
film_freq_UBCF <- as.data.frame(table(as.factor(TOP15_UBCF)))
colnames(film_freq_UBCF) <- c("Film_by_UBCF", "Frequency")
head(film_freq_IBCF %>% arrange(desc(Frequency)),15)
head(film_freq_UBCF %>% arrange(desc(Frequency)),15)
# Bar charts of the 15 most frequently recommended films per model.
ggplot(head(film_freq_IBCF %>% arrange(desc(Frequency)),15),aes(x = reorder(Film_by_IBCF,Frequency), y = Frequency)) + geom_col() + coord_flip() +
labs(y="Frequency", x="Film",title="Distribution of the Top-15 films for all the users with IBCF") +
theme(plot.title = element_text(hjust = 0.5))

ggplot(head(film_freq_UBCF %>% arrange(desc(Frequency)),15),aes(x = reorder(Film_by_UBCF,Frequency), y = Frequency)) + geom_col() + coord_flip() +
labs(y="Frequency", x="Film",title="Distribution of the Top-15 films for all the users with UBCF") +
theme(plot.title = element_text(hjust = 0.5))

The maximum recommended frequency with IBCF and UBCF are 26 and 28 respectively, and minimum of 15 and 13. However comparing to the IBCF, the distribution in UBCF has a longer tail. This means with the UBCF model, some movies are recommended much more often than the others.
6 Analyse Top-N Listen Ratings Untersuche den Einfluss von Ratings (ordinale vs binäre Ratings) und Modelltyp (IBCF vs UBCF) auf Top N Empfehlungen für den reduzierten Datensatz.
6.1 Vergleiche den Anteil übereinstimmender Empfehlungen der Top 15 Liste für IBCF vs UBCF, beide mit ordinalem Rating und Cosine Similarity für alle Testkunden
# the test users' "unknown" ratings, as a plain matrix
mx_reduced_unknown <- as(rrm_reduced_unknown,"matrix")
# Predict the ratings of the test users once per model and reuse the
# prediction object both for the matrix view and for the accuracy computation
# (the original called predict() twice per model with identical arguments).
pred_IBCF_ratings <- predict(object = model_IBCF, newdata = rrm_reduced_known, n = 15,type="ratings")
pred_UBCF_ratings <- predict(object = model_UBCF, newdata = rrm_reduced_known, n = 15,type="ratings")
pred_rating_IBCF <- as(pred_IBCF_ratings,"matrix")
pred_rating_UBCF <- as(pred_UBCF_ratings,"matrix")
# evaluate the rating predictions against the held-back "unknown" ratings
acc_IB <- calcPredictionAccuracy(pred_IBCF_ratings,rrm_reduced_unknown)
acc_UB <- calcPredictionAccuracy(pred_UBCF_ratings,rrm_reduced_unknown)
acc_ordinal <- rbind(acc_IB,acc_UB)
rownames(acc_ordinal) <- c("IBCF ordinal","UBCF ordinal")
acc_ordinal
RMSE MSE MAE
IBCF ordinal 1.292176 1.669719 0.9504537
UBCF ordinal 1.073440 1.152273 0.8403449
the UBCF model with ordinal ratings and cosine similarity has better accuracy.
6.2 Vergleiche den Anteil übereinstimmender Empfehlungen der Top 15 Liste für IBCF vs UBCF, beide mit binärem Rating und Jaccard Similarity für alle Testkunden
# convert the reduced dataset to binary: ratings > 3 become 1, ratings <= 3 become 0, NA stays NA
# (the NA entries produce NA in the logical index below; `[<-` with a scalar
# replacement skips those positions, so the order of the two assignments is safe)
df_reduced_bi <- df_reduced
df_reduced_bi[df_reduced_bi <= 3] <- 0
df_reduced_bi[df_reduced_bi > 3] <- 1
set.seed(468)
mx_reduced_bi <- as.matrix(df_reduced_bi)
# NOTE(review): the 0/1 data is wrapped in a realRatingMatrix and later
# z-score normalised; recommenderlab also provides binaryRatingMatrix for
# truly binary data — verify this combination is intended.
rrm_reduced_bi <- as(mx_reduced_bi,"realRatingMatrix")
train_test_bi <- evaluationScheme(rrm_reduced_bi, method="split", train=0.8, k=1, given=20)
# training data: 80% of the users
rrm_reduced_train_bi <- getData(train_test_bi,"train")
# test data: the remaining 20% of users, split into known and unknown ratings
# the known portion holds the 20 given items per test user and is used to compute the predictions
rrm_reduced_known_bi <- getData(train_test_bi,"known")
# the unknown portion is used to compute the prediction error of the model
rrm_reduced_unknown_bi <- getData(train_test_bi,"unknown")
# train the IBCF and UBCF models (Jaccard similarity) on the training dataset
model_IBCF_bi <- Recommender(data = rrm_reduced_train_bi,method="IBCF",parameter=list(normalize = "Z-score",method="Jaccard",k=30))
model_UBCF_bi <- Recommender(data = rrm_reduced_train_bi,method="UBCF",parameter=list(normalize = "Z-score",method="Jaccard",nn=30))
# predict the ratings of the test users with IBCF and UBCF
pred_rating_IBCF_bi <- as(predict(object = model_IBCF_bi, newdata = rrm_reduced_known_bi, n = 15,type="ratings"),"matrix")
pred_rating_UBCF_bi <- as(predict(object = model_UBCF_bi, newdata = rrm_reduced_known_bi, n = 15,type="ratings"),"matrix")
# the test users' "unknown" ratings as a plain matrix
mx_reduced_unknown_bi <- as(rrm_reduced_unknown_bi,"matrix")
# evaluate the rating predictions against the "unknown" ratings
acc_IB_bi <- calcPredictionAccuracy(predict(object = model_IBCF_bi, newdata = rrm_reduced_known_bi, n = 15,type="ratings"),rrm_reduced_unknown_bi)
acc_UB_bi <- calcPredictionAccuracy(predict(object = model_UBCF_bi, newdata = rrm_reduced_known_bi, n = 15,type="ratings"),rrm_reduced_unknown_bi)
acc_bi <- rbind(acc_IB_bi,acc_UB_bi)
rownames(acc_bi) <- c("IBCF binary","UBCF binary")
acc_bi
RMSE MSE MAE
IBCF binary 0.6302688 0.3972388 0.3976143
UBCF binary 0.4748717 0.2255031 0.3975344
The UBCF model with binary ratings and Jaccard similarity has better accuracy.
6.3 Vergleiche den Anteil übereinstimmender Empfehlungen der Top 15 Liste für UBCF mit ordinalem (Cosine Similarity) vs binärem Rating (Jaccard Similarity) für alle Testkunden.
# Stack the ordinal and binary error tables for a side-by-side comparison.
rbind(acc_ordinal,acc_bi)
RMSE MSE MAE
IBCF ordinal 1.2921763 1.6697195 0.9504537
UBCF ordinal 1.0734398 1.1522730 0.8403449
IBCF binary 0.6302688 0.3972388 0.3976143
UBCF binary 0.4748717 0.2255031 0.3975344
The models with binary ratings show much lower error metrics than the models with ordinal ratings; note, however, that RMSE/MSE/MAE on a 0/1 scale are not directly comparable to those on a 1-5 rating scale.
7 Analyse Top-N Listen -IBCF vs SVD Aufgabe: Vergleiche Memory-based IBCF und Modell-based SVD Recommenders bezüglich Überschneidung ihrer Top-N Empfehlungen für die User-Item Matrix des reduzierten Datensatzes (Basis: IBCF mit 30 Nachbarn und Cosine Similarity).
- Vergleiche wie sich der Anteil übereinstimmender Empfehlungen der Top-15 Liste für IBCF vs verschiedene SVD Modelle verändert, wenn die Anzahl der Singulärwerte für SVD von 10 auf 20, 30, 40, 50 verändert wird.
# SVD MODEL: model-based recommenders with k singular values (latent factors), k = 10..50
model_SVD_10 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=10))
model_SVD_20 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=20))
model_SVD_30 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=30))
model_SVD_40 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=40))
model_SVD_50 <- Recommender(data = rrm_reduced_train,method="SVD",parameter=list(normalize = "Z-score",k=50))
# evaluate the top-15 recommendations against the "unknown" ratings (rating >= 4 counts as relevant)
acc_SVD_10 <- calcPredictionAccuracy(predict(object = model_SVD_10, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=20,goodRating = 4)
acc_SVD_20 <- calcPredictionAccuracy(predict(object = model_SVD_20, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=20,goodRating = 4)
acc_SVD_30 <- calcPredictionAccuracy(predict(object = model_SVD_30, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=20,goodRating = 4)
acc_SVD_40 <- calcPredictionAccuracy(predict(object = model_SVD_40, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=20,goodRating = 4)
acc_SVD_50 <- calcPredictionAccuracy(predict(object = model_SVD_50, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=20,goodRating = 4)
# baseline: the IBCF model from 4.2, evaluated the same way
acc_IBCF_top15 <- calcPredictionAccuracy(predict(object = model_IBCF, newdata = rrm_reduced_known, n = 15,type="topNList"),rrm_reduced_unknown,given=20,goodRating = 4)
acc_SVD_IB <- rbind(acc_SVD_10,acc_SVD_20,acc_SVD_30,acc_SVD_40,acc_SVD_50,acc_IBCF_top15)
rownames(acc_SVD_IB) <- c("SVD_k_10","SVD_k_20","SVD_k_30","SVD_k_40","SVD_k_50","IBCF_cos_k_30")
acc_SVD_IB
TP FP FN TN N precision recall TPR FPR
SVD_k_10 6.8375 8.1625 74.5500 590.4500 680 0.4558333 0.09185763 0.09185763 0.01346130
SVD_k_20 6.0625 8.9375 75.3250 589.6750 680 0.4041667 0.07887917 0.07887917 0.01478280
SVD_k_30 5.9250 9.0750 75.4625 589.5375 680 0.3950000 0.07734909 0.07734909 0.01500427
SVD_k_40 6.0250 8.9750 75.3625 589.6375 680 0.4016667 0.07828355 0.07828355 0.01484067
SVD_k_50 5.5375 9.4625 75.8500 589.1500 680 0.3691667 0.07166389 0.07166389 0.01567906
IBCF_cos_k_30 5.7875 9.2125 75.6000 589.4000 680 0.3858333 0.07194244 0.07194244 0.01518621
The SVD model with k = 10 singular values has the best precision and recall.
SVD k=10 > SVD k=20 > SVD k=40 > SVD k=30 > IBCF cosine k=30 > SVD k=50
8 Wahl des optimalen Recommenders Aufgabe: Bestimme aus 5 unterschiedlichen Modellen das hinsichtlich Top-N Empfehlungen beste Modell. Begründe deine Modellwahlen aufgrund der bisher gemachten Erkenntnisse und verwende als 6. Modell einen Top-Movie Recommender (Basis: reduzierter Datensatz).
8.1 Verwende für die Evaluierung 10-fache Kreuzvalidierung
#create 10-fold cross validation scheme
set.seed(6954)
scheme <- evaluationScheme(rrm_reduced, method="cross", k=10, given=20, goodRating=4)
# evaluate with different methods (top-15 lists throughout)
# IBCF: cosine similarity, 30 neighbours, z-score normalization
cv_IBCF <- evaluate(scheme, method="IBCF", type = "topNList",parameter=list(normalize = "Z-score",method="cosine",k=30),n=15)
IBCF run fold/sample [model time/prediction time]
1 [2.01sec/0.04sec]
2 [1.23sec/0.04sec]
3 [1.65sec/0.04sec]
4 [0.89sec/0.03sec]
5 [1.28sec/0.04sec]
6 [1.25sec/0.04sec]
7 [1sec/0.03sec]
8 [1.06sec/0.03sec]
9 [1.07sec/0.04sec]
10 [1.32sec/0.11sec]
# UBCF: cosine similarity, 30 nearest neighbours, z-score normalization
cv_UBCF <- evaluate(scheme, method="UBCF", type = "topNList",parameter=list(normalize = "Z-score",method="cosine",nn=30),n=15)
UBCF run fold/sample [model time/prediction time]
1 [0.03sec/0.25sec]
2 [0.02sec/0.22sec]
3 [0.01sec/0.24sec]
4 [0.01sec/0.28sec]
5 [0.03sec/0.23sec]
6 [0.03sec/0.2sec]
7 [0.03sec/0.22sec]
8 [0.01sec/0.22sec]
9 [0.03sec/0.23sec]
10 [0.03sec/0.28sec]
# SVD with 30 singular values, z-score normalization
cv_SVD <- evaluate(scheme, method="SVD", type = "topNList",parameter=list(normalize = "Z-score",k=30),n=15)
SVD run fold/sample [model time/prediction time]
1 [0.24sec/0.03sec]
2 [0.27sec/0.05sec]
3 [0.23sec/0.05sec]
4 [0.26sec/0.05sec]
5 [0.26sec/0.13sec]
6 [0.36sec/0.08sec]
7 [0.31sec/0.03sec]
8 [0.25sec/0.05sec]
9 [0.28sec/0.04sec]
10 [0.27sec/0.11sec]
# RANDOM: random recommendations as a lower-bound baseline
cv_RANDOM <- evaluate(scheme,method="RANDOM",type="topNList",n=15)
RANDOM run fold/sample [model time/prediction time]
1 [0sec/0.05sec]
2 [0sec/0.05sec]
3 [0sec/0.05sec]
4 [0.01sec/0.03sec]
5 [0sec/0.03sec]
6 [0sec/0.05sec]
7 [0.01sec/0.03sec]
8 [0sec/0.03sec]
9 [0sec/0.03sec]
10 [0sec/0.03sec]
# POPULAR: non-personalised top-movie recommender (most popular items)
cv_POP <- evaluate(scheme, method="POPULAR", type = "topNList",parameter=list(normalize = "Z-score"),n=15)
POPULAR run fold/sample [model time/prediction time]
1 [0.03sec/0.19sec]
2 [0.03sec/0.17sec]
3 [0.09sec/0.14sec]
4 [0.03sec/0.11sec]
5 [0.03sec/0.15sec]
6 [0.03sec/0.15sec]
7 [0.04sec/0.17sec]
8 [0.05sec/0.17sec]
9 [0.03sec/0.17sec]
10 [0.03sec/0.14sec]
# get the evaluation results averaged over the 10 folds
Result_81 <- rbind(avg(cv_IBCF),avg(cv_UBCF),avg(cv_SVD),avg(cv_RANDOM),avg(cv_POP))
rownames(Result_81) <- c("IBCF","UBCF","SVD","RANDOM","POPULAR")
Result_81
TP FP FN TN N precision recall TPR FPR n
IBCF 5.9325 9.0675 78.8625 586.1375 680 0.3955000 0.07568762 0.07568762 0.01501425 15
UBCF 2.0375 12.9625 82.7575 582.2425 680 0.1358333 0.02445926 0.02445926 0.02178287 15
SVD 5.8900 9.1100 78.9050 586.0950 680 0.3926667 0.07739847 0.07739847 0.01513654 15
RANDOM 3.7675 11.2325 81.0275 583.9725 680 0.2511667 0.04431406 0.04431406 0.01871414 15
POPULAR 7.7900 7.2100 77.0050 587.9950 680 0.5193333 0.10232189 0.10232189 0.01184747 15
The model with popular method has the best precision and recall.
Popular > IBCF ~ SVD > RANDOM > UBCF
8.2 Begründe deine Wahl der Performance Metrik,
Higher precision means that an algorithm returns more relevant results than irrelevant ones, and high recall means that an algorithm returns most of the relevant results (whether or not irrelevant ones are also returned).
A perfect precision score of 1.0 means that every result retrieved was relevant (but says nothing about whether all relevant documents were retrieved) whereas a perfect recall score of 1.0 means that all relevant documents were retrieved by the search (but says nothing about how many irrelevant documents were also retrieved)
The model Popular returns the highest score of both precision and recall.
Popular > SVD ~ IBCF > RANDOM > UBCF
8.3 Analysiere das beste Modell für Top-N Recommendations mit N gleich 10, 15, 20, 25 und 30,
# Best model (POPULAR) evaluated for several top-N list lengths.
POP_results <- evaluate(scheme, method="POPULAR", type = "topNList",parameter=list(normalize = "Z-score"),n=c(10,15,20,25,30))
POPULAR run fold/sample [model time/prediction time]
1 [0.05sec/0.26sec]
2 [0.05sec/0.19sec]
3 [0.03sec/0.11sec]
4 [0.03sec/0.13sec]
5 [0.03sec/0.14sec]
6 [0.03sec/0.12sec]
7 [0.04sec/0.15sec]
8 [0.03sec/0.16sec]
9 [0.03sec/0.15sec]
10 [0.03sec/0.14sec]
# Averaged confusion-matrix metrics per list length n (one row per n).
avg_POP_results <- avg(POP_results)
avg_POP_results
TP FP FN TN N precision recall TPR FPR n
[1,] 5.4325 4.5675 79.3625 590.6375 680 0.5432500 0.07252614 0.07252614 0.007496331 10
[2,] 7.7900 7.2100 77.0050 587.9950 680 0.5193333 0.10232189 0.10232189 0.011847466 15
[3,] 9.8050 10.1950 74.9900 585.0100 680 0.4902500 0.12716681 0.12716681 0.016767608 20
[4,] 11.6650 13.3350 73.1300 581.8700 680 0.4666000 0.15028748 0.15028748 0.021961408 25
[5,] 13.4225 16.5775 71.3725 578.6275 680 0.4474167 0.17192212 0.17192212 0.027340196 30
When I increase the N, the “recall” is getting better (larger value), but the “precision” is getting worse (smaller value).
8.4 Optimiere dein bestes Modell hinsichtlich Hyperparameter. Hinweis: Verwende für den Top-Movie Recommender die Filme mit den höchsten Durchschnittsratings.
# keep only the films with the highest average ratings (mean rating > 3)
df_top_avg <- as.data.frame(t(df_reduced))
df_top_avg <- df_top_avg %>% mutate(avg_rating = rowMeans(df_top_avg,na.rm=TRUE,dims=1))%>% arrange(desc(avg_rating))%>% filter(avg_rating>3) %>% select(-avg_rating)
rrm_top_avg <- as(t(df_top_avg),"realRatingMatrix") # back to users x films
set.seed(846954)
scheme_top_avg <- evaluationScheme(rrm_top_avg, method="cross", k=10, given=20, goodRating=4)
# the POPULAR model has only one parameter (normalize); compare two
# normalization methods, z-score vs center
POP_top_avg_z <- avg(evaluate(scheme_top_avg, method="POPULAR", type = "topNList",parameter=list(normalize = "Z-score"),n=c(10,15,20,25,30)))
POPULAR run fold/sample [model time/prediction time]
1 [0.03sec/0.11sec]
2 [0.04sec/0.12sec]
3 [0.04sec/0.12sec]
4 [0.06sec/0.17sec]
5 [0.05sec/0.19sec]
6 [0.03sec/0.12sec]
7 [0.03sec/0.14sec]
8 [0.05sec/0.14sec]
9 [0.03sec/0.17sec]
10 [0.04sec/0.18sec]
# same evaluation with centering (mean removal) instead of z-score
POP_top_avg_center <- avg(evaluate(scheme_top_avg, method="POPULAR", type = "topNList",parameter=list(normalize = "center"),n=c(10,15,20,25,30)))
POPULAR run fold/sample [model time/prediction time]
1 [0.01sec/0.13sec]
2 [0.03sec/0.23sec]
3 [0.02sec/0.12sec]
4 [0sec/0.14sec]
5 [0.02sec/0.12sec]
6 [0sec/0.14sec]
7 [0.01sec/0.13sec]
8 [0.01sec/0.13sec]
9 [0.02sec/0.15sec]
10 [0.01sec/0.19sec]
# Difference (z-score minus center) in precision (col 6) and recall (col 7),
# with the list length n (col 10) attached for reference.
diff_z_center <- cbind((POP_top_avg_z - POP_top_avg_center)[,6:7],POP_top_avg_z[,10])
POP_top_avg_z; POP_top_avg_center; diff_z_center
TP FP FN TN N precision recall TPR FPR n
[1,] 5.3750 4.6250 74.2000 461.8000 546 0.5375000 0.07540714 0.07540714 0.009630248 10
[2,] 7.6650 7.3350 71.9100 459.0900 546 0.5110000 0.10625880 0.10625880 0.015300434 15
[3,] 9.5875 10.4125 69.9875 456.0125 546 0.4793750 0.13255558 0.13255558 0.021790133 20
[4,] 11.3750 13.6250 68.2000 452.8000 546 0.4550000 0.15450913 0.15450913 0.028541074 25
[5,] 12.9425 17.0575 66.6325 449.3675 546 0.4314167 0.17375578 0.17375578 0.035781723 30
TP FP FN TN N precision recall TPR FPR n
[1,] 5.380 4.620 74.195 461.805 546 0.5380000 0.07560488 0.07560488 0.009622775 10
[2,] 7.705 7.295 71.870 459.130 546 0.5136667 0.10685028 0.10685028 0.015212167 15
[3,] 9.585 10.415 69.990 456.010 546 0.4792500 0.13201246 0.13201246 0.021785083 20
[4,] 11.390 13.610 68.185 452.815 546 0.4556000 0.15485525 0.15485525 0.028504659 25
[5,] 13.040 16.960 66.535 449.465 546 0.4346667 0.17484967 0.17484967 0.035553745 30
precision recall
[1,] -0.000500000 -0.0001977426 10
[2,] -0.002666667 -0.0005914847 15
[3,] 0.000125000 0.0005431183 20
[4,] -0.000600000 -0.0003461215 25
[5,] -0.003250000 -0.0010938851 30
Here I tried to optimize the POPULAR model through its normalization parameter. The two normalization methods, z-score and center, have very similar precision and recall. The model with z-score performs slightly better for n = 10, 15, 25, 30, while the model with center normalization performs a bit better for n = 20.
9 Implementierung Ähnlichkeitsmatrix
Aufgabe DIY: Implementiere eine Funktion zur effizienten Berechnung von sparsen Ähnlichkeitsmatrizen für IBCF RS und analysiere die Resultate für 100 zufällig gewählte Filme.
9.1 Implementiere eine Funktion, um für ordinale Ratings effizient die Cosine Similarity zu berechnen,
cos_similarity <- function(mx){
  # Cosine similarity between all item columns of a user-item rating matrix.
  #
  # mx: user-item ratings (matrix or data.frame), NA = not rated.
  # Returns an n x n matrix (n = ncol(mx)) where entry [i, j] is the cosine
  # similarity of columns i and j with NAs treated as 0.
  # An all-zero column yields NaN (0/0), exactly as the pairwise loop would.
  m <- as.matrix(mx)
  m[is.na(m)] <- 0                 # unrated entries contribute nothing
  # crossprod(m) = t(m) %*% m gives every pairwise dot product in one
  # BLAS call, replacing the original O(n^2) double loop over columns.
  dot_products <- crossprod(m)
  norms <- sqrt(colSums(m^2))      # Euclidean length of each column
  sim_mx <- dot_products / outer(norms, norms)
  dimnames(sim_mx) <- NULL         # match the unnamed matrix of the loop version
  return(sim_mx)
}
cos_sim_reduced_1 <- cos_similarity(df_reduced)
9.2 Implementiere eine Funktion, um für binäre Ratings effizient die Jaccard Similarity zu berechnen,
Jacc_similarity <- function(mx){
  # Jaccard similarity between all item columns of a user-item rating matrix.
  #
  # Ratings are binarized first: a rating > 3 signals a preference (1);
  # ratings <= 3 and missing ratings count as "not liked" (0).
  # Jaccard(i, j) = |i AND j| / |i OR j| on the binarized columns.
  #
  # BUG FIX: the original computed 1 - (#differing users)/ncol(mx), i.e. a
  # simple-matching coefficient divided by the number of ITEMS instead of the
  # vector length - neither value is the Jaccard similarity. It also applied
  # the <= 3 threshold before removing NAs, which is fragile.
  #
  # mx: user-item ratings (matrix or data.frame), NA = not rated.
  # Returns an n x n similarity matrix (n = ncol(mx)).
  b <- as.matrix(mx)
  b[is.na(b)] <- 0            # replace NAs first so the comparison below is NA-free
  b <- (b > 3) * 1            # binarize: preference -> 1, otherwise 0
  # Pairwise intersection sizes for all column pairs in one matrix product.
  intersection <- crossprod(b)
  ones <- colSums(b)          # number of liked items per column
  # |A or B| = |A| + |B| - |A and B|
  union_size <- outer(ones, ones, "+") - intersection
  sim_mx <- intersection / union_size
  # Two empty preference sets are treated as identical (avoids 0/0 = NaN).
  sim_mx[union_size == 0] <- 1
  dimnames(sim_mx) <- NULL
  return(sim_mx)
}
Jacc_sim_reduced <- Jacc_similarity(df_reduced) # item-item Jaccard similarity matrix
9.3 Vergleiche deine Implementierung der Cosine-basierten Ähnlichkeitsmatrix für ordinale Kundenratings mit der korrespondierenden via Open Source Paketen erzeugten Ähnlichkeitsmatrix,
mx_reduced_0 <- mx_reduced
mx_reduced_0[is.na(mx_reduced_0)]<-0 # replace NA with 0, matching the own implementation
# Cosine similarity matrix computed with the recommenderlab package, for comparison
cos_sim_reduced_2 <- as.matrix(similarity(as(mx_reduced_0,"realRatingMatrix"), method = "cosine", which = "items")) # calculate the cosine similarity matrix by open source package
# recommenderlab stores 0 on the diagonal, while the own implementation yields 1
# (cosine of a vector with itself), so refill the diagonal with 1 before comparing
diag(cos_sim_reduced_2)<-1
compare_two_cos_sim_methods <- all.equal(cos_sim_reduced_1, cos_sim_reduced_2, tolerance = 1e-10,check.attributes = FALSE)
compare_two_cos_sim_methods
[1] TRUE
The cosine similarity matrices by two different methods are equal (with tolerance of 1e-10).
9.4 Vergleiche und diskutiere die Unterschiede deiner mittels Cosine Similarity erzeugten Ähnlichkeitsmatrizen für ordinale und normierte Kundenratings mit der Jaccard-basierten Ähnlichkeitsmatrix.
# Compare the own cosine similarity matrix with the Jaccard similarity matrix
compare_cos_Jacc <- all.equal(cos_sim_reduced_1,Jacc_sim_reduced,tolerance = 1e-3,check.attributes = FALSE)
compare_cos_Jacc
[1] "Mean relative difference: 2.480877"
The mean relative difference between cosine similarity and jaccard similarity is 2.48.
Jaccard similarity only considers which items are liked (set overlap of the binarized ratings), whereas cosine similarity also uses the magnitudes of the ratings, so the two similarity matrices differ substantially.
10 Implementierung Top-N Metriken
Aufgabe DIY: Implementiere Funktionen für die Beurteilung der Top-N Metriken Precision und Recall sowie für alle Kunden der Item-space Coverage und Novelty und teste diese mit IBCF Recommendations (Basis: reduzierter Datensatz; N = 5, 10, 15, 20, 25, 30)
10.1 Implementiere eine Funktion, um aus Top-N Listen für alle Kunden die Item-space Coverage@N und Novelty@N eines Recommenders zu beurteilen und teste diese.
calc_topn_metrics <- function(mx,split_ratio,N){ # mx: user-item rating matrix; split_ratio: train proportion; N: length of the Top-N list
# Evaluates an IBCF recommender on a random train/test split of mx and returns
# a list with: accuracy (precision/recall etc.), item-space coverage@N and
# per-user novelty@N.
rrm <- as(mx,"realRatingMatrix")
# split into train, test-known and test-unknown data
train_test <- evaluationScheme(rrm, method="split", train=split_ratio, k=1, given=20,goodRating=4)
rrm_train <- getData(train_test,"train")
rrm_known <- getData(train_test,"known")
rrm_unknown <- getData(train_test,"unknown")
# IBCF model (cosine similarity, 10 neighbours, z-score normalization)
model_IBCF <-Recommender(data = rrm_train,method="IBCF",parameter=list(normalize = "Z-score",method="Cosine",k=10))
# predict Top-N recommendation lists for the test users
pred_IBCF <- predict(object = model_IBCF, newdata = rrm_known, n = N,type="topNList")
####################################
### accuracy: evaluate the recommendations on the "unknown" ratings with precision and recall
# NOTE(review): the prediction is recomputed here although pred_IBCF above holds the same result
acc_IBCF <- calcPredictionAccuracy(predict(object = model_IBCF, newdata = rrm_known, n = N,type="topNList"),rrm_unknown,given=20,goodRating = 4)
acc_IBCF <- t(as.data.frame(acc_IBCF))
rownames(acc_IBCF) <- NULL
####################################
### item-space coverage: share of films (with at least one rating) that occur in any Top-N list
# Top-N film titles for every test user
TOP_N_list <- sapply(pred_IBCF@items, function(x) {colnames(as(rrm_known,"matrix"))[x]})
# unique recommended films over all test users
uniq_film_test <- reshape2::melt(as(TOP_N_list,"matrix")) %>% rename(UserID = Var2, rank = Var1, Film_name = value)%>%distinct(Film_name) # unique film list recommended in test data
# films of the input data that received at least one rating
uniq_film_train <- as.data.frame(t(mx))
uniq_film_train$cnt <- rowSums(!is.na(uniq_film_train)) # count non-NA ratings per film
uniq_film_train <- uniq_film_train %>% filter(cnt>0) # drop films without any ratings
# item-space coverage = recommended films / rateable films
coverage <- dim(uniq_film_test)[1] / dim(uniq_film_train)[1] # the coverage
coverage <- as.data.frame(coverage)
colnames(coverage) <- "coverage"
####################################
### novelty per user: share of films in the Top-N list that the user has not rated yet
novelty_table <- data.frame() # filled row by row with the per-user novelty values
df <- as.data.frame(mx)
# NOTE(review): `df` above is never used below and could be removed
pred_IBCF_all_user <- predict(object = model_IBCF, newdata = rrm, n = N,type="topNList") # predict for all users
TOP_N_list_all_user <- sapply(pred_IBCF_all_user@items, function(x) {colnames(mx)[x]}) # Top-N titles for all users
# per user: count how many recommended films were already rated by that user
for(i in 1:dim(mx)[1]){
# NOTE(review): t(mx) is rebuilt on every iteration; hoisting it out of the loop would not change the result
df_i <- as.data.frame(t(mx))#
df_i$Film <- colnames(mx)
df_i <- df_i[,c(i,(dim(mx)[1]+1))] %>% filter(complete.cases(.)) # films rated by user i
df_i$Film <- rownames(df_i) # film titles as the join key
df_top_n <- as.data.frame(TOP_N_list_all_user[,i]) # Top-N list of user i
colnames(df_top_n) <- "Film"
df_cross <- inner_join(df_i, df_top_n, by="Film") # films that are both rated and recommended
novelty <- 1 - dim(df_cross)[1]/N # novelty of user i
novelty_table <- rbind(novelty_table, novelty)
}
novelty_table$UserID <- rownames(mx)
colnames(novelty_table) <- c("novelty","UserID")
novelty_table <- novelty_table %>% select(UserID,novelty) # novelty table
### result: accuracy, coverage and novelty
my_list <- list("accuracy" = acc_IBCF,"coverage" = coverage, "novelty" = novelty_table)
return(my_list)
}
# Example run: 80/20 split, Top-20 recommendation lists
test <- calc_topn_metrics(mx_reduced,0.8,20)
test$accuracy;test$coverage; test$novelty
TP FP FN TN N precision recall TPR FPR
[1,] 6.9 12.9875 77.075 583.0375 680 0.3460227 0.09009878 0.09009878 0.02159826
11 Implementierung Top-N Monitor Aufgabe DIY: Untersuche die relative Übereinstimmung zwischen Top-N Empfehlungen und präferierten Filmen für 4 unterschiedliche Modelle (z.B. IBCF und UBCF mit unterschiedlichen Ähnlichkeits-metriken / Nachbarschaften sowie SVD mit unterschiedlicher Dimensionalitätsreduktion).
11.1 Fixiere 20 zufällig gewählte Testkunden für alle Modellvergleiche,
set.seed(578)
# 95/5 split: fixes the same 20 test users for all model comparisons
train_test_11 <- evaluationScheme(rrm_reduced, method="split", train=0.95, k=1, given=20,goodRating=4)
# training dataset has 380 users, test dataset has 20 users
# given=20: for each test user, 20 films are used for prediction, the rest for evaluation
rrm_reduced_train_11 <- getData(train_test_11,"train")
rrm_reduced_known_11 <- getData(train_test_11,"known")
rrm_reduced_unknown_11 <- getData(train_test_11,"unknown")
# IBCF models (cosine/Pearson similarity, 10/50 neighbours)
model_IBCF_cos_10 <-Recommender(data = rrm_reduced_train_11,method="IBCF",parameter=list(normalize = "Z-score",method="Cosine",k=10))
model_IBCF_cos_50 <-Recommender(data = rrm_reduced_train_11,method="IBCF",parameter=list(normalize = "Z-score",method="Cosine",k=50))
model_IBCF_ps_10 <-Recommender(data = rrm_reduced_train_11,method="IBCF",parameter=list(normalize = "Z-score",method="Pearson",k=10))
model_IBCF_ps_50 <-Recommender(data = rrm_reduced_train_11,method="IBCF",parameter=list(normalize = "Z-score",method="Pearson",k=50))
# UBCF models (cosine/Pearson similarity, 10/50 neighbours)
model_UBCF_cos_10 <-Recommender(data = rrm_reduced_train_11,method="UBCF",parameter=list(normalize = "Z-score",method="Cosine",nn=10))
model_UBCF_cos_50 <-Recommender(data = rrm_reduced_train_11,method="UBCF",parameter=list(normalize = "Z-score",method="Cosine",nn=50))
model_UBCF_ps_10 <-Recommender(data = rrm_reduced_train_11,method="UBCF",parameter=list(normalize = "Z-score",method="Pearson",nn=10))
model_UBCF_ps_50 <-Recommender(data = rrm_reduced_train_11,method="UBCF",parameter=list(normalize = "Z-score",method="Pearson",nn=50))
# SVD models (10/50 latent dimensions)
model_SVD_10 <- Recommender(data = rrm_reduced_train_11,method="SVD",parameter=list(normalize = "Z-score",k=10))
model_SVD_50 <- Recommender(data = rrm_reduced_train_11,method="SVD",parameter=list(normalize = "Z-score",k=50))
# evaluation of the Top-15 predictions on the unknown test ratings
acc_IBCF_cos_10 <- calcPredictionAccuracy(predict(object = model_IBCF_cos_10, newdata = rrm_reduced_known_11, n = 15,type="topNList"),rrm_reduced_unknown_11,given=20,goodRating = 4)
acc_IBCF_cos_50 <- calcPredictionAccuracy(predict(object = model_IBCF_cos_50, newdata = rrm_reduced_known_11, n = 15,type="topNList"),rrm_reduced_unknown_11,given=20,goodRating = 4)
acc_IBCF_ps_10 <- calcPredictionAccuracy(predict(object = model_IBCF_ps_10, newdata = rrm_reduced_known_11, n = 15,type="topNList"),rrm_reduced_unknown_11,given=20,goodRating = 4)
acc_IBCF_ps_50 <- calcPredictionAccuracy(predict(object = model_IBCF_ps_50, newdata = rrm_reduced_known_11, n = 15,type="topNList"),rrm_reduced_unknown_11,given=20,goodRating = 4)
acc_UBCF_cos_10 <- calcPredictionAccuracy(predict(object = model_UBCF_cos_10, newdata = rrm_reduced_known_11, n = 15,type="topNList"),rrm_reduced_unknown_11,given=20,goodRating = 4)
acc_UBCF_cos_50 <- calcPredictionAccuracy(predict(object = model_UBCF_cos_50, newdata = rrm_reduced_known_11, n = 15,type="topNList"),rrm_reduced_unknown_11,given=20,goodRating = 4)
acc_UBCF_ps_10 <- calcPredictionAccuracy(predict(object = model_UBCF_ps_10, newdata = rrm_reduced_known_11, n = 15,type="topNList"),rrm_reduced_unknown_11,given=20,goodRating = 4)
acc_UBCF_ps_50 <- calcPredictionAccuracy(predict(object = model_UBCF_ps_50, newdata = rrm_reduced_known_11, n = 15,type="topNList"),rrm_reduced_unknown_11,given=20,goodRating = 4)
acc_SVD_10 <- calcPredictionAccuracy(predict(object = model_SVD_10, newdata = rrm_reduced_known_11, n = 15,type="topNList"),rrm_reduced_unknown_11,given=20,goodRating = 4)
acc_SVD_50 <- calcPredictionAccuracy(predict(object = model_SVD_50, newdata = rrm_reduced_known_11, n = 15,type="topNList"),rrm_reduced_unknown_11,given=20,goodRating = 4)
# one row of metrics per model
acc_table <- rbind(acc_IBCF_cos_10,acc_IBCF_cos_50,acc_IBCF_ps_10,acc_IBCF_ps_50,acc_UBCF_cos_10,acc_UBCF_cos_50,acc_UBCF_ps_10,acc_UBCF_ps_50,acc_SVD_10,acc_SVD_50)
acc_table
TP FP FN TN N precision recall TPR FPR
acc_IBCF_cos_10 6.30 8.70 87.15 577.85 680 0.4200000 0.07252980 0.07252980 0.01456784
acc_IBCF_cos_50 6.75 8.25 86.70 578.30 680 0.4500000 0.07716093 0.07716093 0.01370982
acc_IBCF_ps_10 3.10 11.15 90.35 575.40 680 0.2364881 0.03374307 0.03374307 0.01900455
acc_IBCF_ps_50 5.15 9.85 88.30 576.70 680 0.3433333 0.05386252 0.05386252 0.01642536
acc_UBCF_cos_10 2.85 12.15 90.60 574.40 680 0.1900000 0.03506794 0.03506794 0.02069613
acc_UBCF_cos_50 2.45 12.55 91.00 574.00 680 0.1633333 0.03580625 0.03580625 0.02140033
acc_UBCF_ps_10 2.30 12.70 91.15 573.85 680 0.1533333 0.02657352 0.02657352 0.02167117
acc_UBCF_ps_50 2.45 12.55 91.00 574.00 680 0.1633333 0.03007206 0.03007206 0.02136549
acc_SVD_10 6.95 8.05 86.50 578.50 680 0.4633333 0.07790237 0.07790237 0.01341930
acc_SVD_50 5.90 9.10 87.55 577.45 680 0.3933333 0.07229660 0.07229660 0.01537873
In the IBCF model, both precision and recall are better with cosine method and 50 neighbors.
In the UBCF model, cosine method and 10 neighbors is a better combination.
In the SVD model, precision and recall are better with singular value of 10.
Through all the models, The SVD model with singular value of 10 has the best precision (0.463) and recall (0.0779).
11.2 Bestimme den Anteil der Top-N Empfehlung nach Genres pro Kunde,
Top_n_genre <- function(Top_n_list,n){ # Top_n_list: films x users matrix of Top-n titles; n: the "n" in Top-n (currently unused in the body)
# For each user (one column of Top_n_list) returns the share of each genre
# among the user's Top-n recommended films. Relies on the GLOBAL genre
# indicator matrix `mx_film_genre` (film titles as row names).
table = data.frame()
for(i in 1:dim(Top_n_list)[2]){
df_top <- as.data.frame(Top_n_list[,i])
colnames(df_top) <- "Film"
df_film_genre_1 <- as.data.frame(mx_film_genre)
df_film_genre_1$Film <- rownames(df_film_genre_1)
# attach the 0/1 genre indicator columns to the recommended films
df_top <- left_join(df_top,df_film_genre_1,by=("Film")) %>% select(-Film)
# NOTE(review): this drops the FIRST recommended film from the genre counts -
# looks unintentional; confirm whether row 1 should really be excluded
df_top <- df_top[-1,]
total <- sum(df_top)
df_top["ratio",] <- colSums(df_top)/total # genre share over the remaining Top-n films
df_top <- df_top["ratio",]
table <- rbind(table,df_top)
}
rownames(table) <- colnames(Top_n_list)
return(table)
}
# Top-15 recommendation list of the SVD model with k = 10
Pred_SVD_k10 <- predict(object = model_SVD_10, newdata = rrm_reduced_known_11, n = 15,type=c("topNList"))
# map the predicted item indices back to film titles (15 films x 20 test users)
Top15_SVD <- sapply(Pred_SVD_k10@items, function(x) {colnames(df_reduced)[x]})
Top_15_recommendations <- Top_n_genre(Top15_SVD,15) # share of each genre in the Top-15 recommendations, per test user
Top_15_recommendations;summary(rowSums(Top_15_recommendations))[-4] # check that the genre shares of each user sum to 1
Min. 1st Qu. Median 3rd Qu. Max.
1 1 1 1 1
The table shows the percentage of genres in the top-15 recommendation list per user.
The sum of each row are all 1, this indicates the total ratio of each user are all correct.
11.3 Bestimme pro Kunde den Anteil nach Genres seiner Top-Filme (=Filme, welche vom Kunden die besten Bewertungen erhalten haben)
Top_film_genre <- function(mx,n){ # mx: user-item rating matrix; n: number of top rated films per user
# For each user returns the share of each genre among the user's n best rated
# films. Relies on the GLOBAL genre indicator matrix `mx_film_genre`.
table = data.frame()
for(i in 1:dim(mx)[1]){
# NOTE(review): t(mx) is recomputed on every iteration; hoisting it out of
# the loop would not change the result
df_top <- as.data.frame(t(mx))
df_top$Film <- rownames(df_top)
df_top <- df_top[,c(i,(dim(mx)[1]+1))] # ratings of user i plus the film names
colnames(df_top) <- c("Ratings","Film")
# the n best rated films of user i (NA ratings removed)
df_top <- df_top %>% filter(complete.cases(.)) %>% arrange(desc(Ratings)) %>% slice(1:n)
df_film_genre_1 <- as.data.frame(mx_film_genre)
df_film_genre_1$Film <- rownames(df_film_genre_1)
df_left_join <- left_join(df_top,df_film_genre_1,by=("Film"))
df_top <- df_left_join[,-(1:2)] # keep only the 0/1 genre indicator columns
total <- sum(df_top)
df_top["ratio",] <- colSums(df_top)/total # genre share over the top rated films
df_top <- df_top["ratio",]
table <- rbind(table,df_top)
}
rownames(table) <- rownames(mx)
return(table)
}
Top_15_films <- Top_film_genre(mx_reduced,15) # genre proportions of the 15 best rated films, per user
Top_15_films
# five-number summary (mean dropped) to check that each row sums to 1
summary(rowSums(Top_15_films))[-4]
Min. 1st Qu. Median 3rd Qu. Max.
1 1 1 1 1
The first table shows the percentage of genres in the top-15 rated list per user.
The sum of each row are all 1, this indicates the total ratio of each user are all correct.
11.4 Vergleiche pro Kunde Top-Empfehlungen und Top-Filmen nach Genres,
# keep only the users from Top_15_films that also appear in the Top-15 recommendations
Top_15_films_reduced <- Top_15_films
Top_15_films_reduced$UserID <- rownames(Top_15_films_reduced)
Top_15_films_reduced <- Top_15_films_reduced %>% filter(UserID %in% rownames(Top_15_recommendations)) %>% select(-UserID) # 20 test users remain
# mean absolute error between recommended-genre shares and top-rated-genre shares, per user
# NOTE(review): the divisor 20 is the number of test users, not the 18 genre
# columns being summed per row - confirm this is the intended per-user MAE
MAE_top_genre <- rowSums(abs(Top_15_films_reduced - Top_15_recommendations))/20
"MAE between Top-recommendations and Top-films by genres per user:"; MAE_top_genre;"the five number statistics of the MAE:";summary(MAE_top_genre)[-4]
[1] "MAE between Top-recommendations and Top-films by genres per user:"
393 417 279 474 472 379 823 267 577 536
0.03677885 0.02048067 0.02954545 0.04768892 0.02958937 0.02647059 0.02817204 0.03555556 0.03646552 0.04252252
899 484 361 391 901 215 323 918 828 552
0.03333333 0.02379032 0.04028122 0.03775388 0.02978177 0.04346591 0.03600000 0.03081897 0.04404894 0.02547093
[1] "the five number statistics of the MAE:"
Min. 1st Qu. Median 3rd Qu. Max.
0.02048067 0.02920210 0.03444444 0.03838572 0.04768892
"MSE between Top-recommendations and Top-films by genres per user:"
[1] "MSE between Top-recommendations and Top-films by genres per user:"
# mean squared error per user
# NOTE(review): divisor 20 = number of test users, not the 18 genre columns - verify
MSE_top_genre <- rowSums((Top_15_films_reduced - Top_15_recommendations)^2)/20
MSE_top_genre;"the five number statistics of the MSE:";summary(MSE_top_genre)[-4]
393 417 279 474 472 379 823 267 577 536
0.003692125 0.001040455 0.002926997 0.006015010 0.002028927 0.002454184 0.001581634 0.002370370 0.002941290 0.004354336
899 484 361 391 901 215 323 918 828 552
0.002295684 0.001557035 0.004265842 0.003357985 0.002117153 0.003690761 0.003457500 0.002399952 0.006203655 0.001318853
[1] "the five number statistics of the MSE:"
Min. 1st Qu. Median 3rd Qu. Max.
0.001040455 0.002095096 0.002690591 0.003691102 0.006203655
"RMSE between Top-recommendations and Top-films by genres per user:"
[1] "RMSE between Top-recommendations and Top-films by genres per user:"
# root mean squared error per user (square root of the MSE above)
RMSE_top_genre <- sqrt(rowSums((Top_15_films_reduced - Top_15_recommendations)^2)/20)
RMSE_top_genre;"the five number statistics of the RMSE:";summary(RMSE_top_genre)[-4]
393 417 279 474 472 379 823 267 577 536
0.06076286 0.03225609 0.05410173 0.07755650 0.04504361 0.04953972 0.03976977 0.04868645 0.05423366 0.06598740
899 484 361 391 901 215 323 918 828 552
0.04791330 0.03945928 0.06531341 0.05794812 0.04601253 0.06075163 0.05880051 0.04898930 0.07876328 0.03631601
[1] "the five number statistics of the RMSE:"
Min. 1st Qu. Median 3rd Qu. Max.
0.03225609 0.04577030 0.05182073 0.06075444 0.07876328
Three quantitativ metrics MAE(mean average error), MSE(mean squared error), RMSE(the root mean squared error) were used to compare the difference between the top-recommendations and top-rated-films by genres.
11.5 Definiere eine Qualitätsmetrik für Top-N Listen und teste sie.
# MAP: Average Precision and Mean Average Precision
MAP <- function(mx,Top_n_list,n){
  # Mean Average Precision (MAP) of Top-n recommendation lists.
  #
  # mx:         user-item rating matrix (users in rows, films in columns)
  # Top_n_list: matrix of recommended film titles, n rows x one column per user
  # n:          length of each Top-n list
  # Returns the mean over all users of the average precision, where an item is
  # "relevant" if the user rated it > 3.
  #
  # keep only the users of mx that appear in the Top-n lists
  mx_part <- as.data.frame(mx) %>% filter(rownames(as.data.frame(mx)) %in% colnames(as.data.frame(Top_n_list)))
  mx_part <- as.data.frame(t(mx_part))     # transpose to film x user
  mx_part$Film <- rownames(mx_part)        # film titles as the join key
  Top_n_list <- as.data.frame(Top_n_list)
  Top_n_list$Rank <- 1:n                   # position of each film in the Top-n list
  summe_precision <- 0
  n_users <- dim(mx_part)[2] - 1           # last column of mx_part is "Film"
  # BUG FIX: the original looped `for(i in (ncol-1))`, i.e. a SINGLE iteration
  # over only the last user; seq_len() iterates over every user column.
  for (i in seq_len(n_users)) {
    Top_i <- Top_n_list[, c(i, dim(Top_n_list)[2])]  # Top-n films + rank of user i
    colnames(Top_i) <- c("Film","Rank")
    mx_i <- mx_part[, c(i, dim(mx_part)[2])]         # ratings + film titles of user i
    colnames(mx_i) <- c("Rating","Film")
    # attach the user's ratings to the recommended films and keep only the
    # relevant items (rating > 3)
    mx_join <- left_join(Top_i, mx_i, by = "Film") %>% filter(Rating > 3)
    if (nrow(mx_join) > 0) {
      # precision at each relevant item: (rank among relevant items) / (rank in the list);
      # seq_len() also avoids the 1:0 crash the original had for 0 relevant items
      mx_join <- mx_join %>% mutate(Numerator = seq_len(nrow(mx_join)), Precision = Numerator/Rank)
      summe_precision <- summe_precision + mean(mx_join$Precision)  # average precision of user i
    }
    # a user with no relevant recommended item contributes an average precision of 0
  }
  map <- summe_precision/(dim(Top_n_list)[2]-1) # mean over all users ("Rank" column excluded from the count)
  return(map)
}
MAP(mx_reduced,TOP15_IBCF,15)
[1] 0.008875812
firstly, for one user, find out the rank of the m-th relevant item (rating > 3) in the top_n_list, then calculate the precision: m/n.
secondly, calculate the precisions of all relevant items.
the average precision of one user: average all the precision of relevant items.
MAP: average precision of all users.
---
title: "Collaborative Movie Recommender"
output: html_notebook
editor_options: 
  markdown: 
    wrap: 72
---

1.  Library und Data

```{r}
library(tidyverse)
library(dplyr)
library(data.table)
library(ggplot2)
library(reshape2)
library(rsample)
library(recommenderlab)
data(MovieLense)
```

2.  Explorative Datenanalyse

```{r}
mx_user_film <- as(MovieLense, "matrix")  # realRatingMatrix -> plain matrix (users x films, NA = not rated)
df_user_film <- as.data.frame(mx_user_film)   # users in rows, films in columns
df_film_user <- as.data.frame(t(mx_user_film)) # transposed: films in rows, users in columns
```

2.1 Welches sind die am häufigsten geschauten Genres/Filme?

```{r}
# count the ratings per film and keep the most-rated film(s)
df_21 <- df_film_user %>% mutate(cnt = rowSums(!is.na(df_film_user))) %>% arrange(desc(cnt)) %>% filter(cnt == max(cnt)) %>% select('cnt')
df_21
```
### Der am häufigsten geschaute Film ist Star Wars.

2.2 Wie verteilen sich die Kundenratings gesamthaft und nach Genres?

```{r}
df_unlist <- data.frame(rating=unlist(df_film_user))            # flatten all ratings into a single column
ggplot(df_unlist,aes(rating)) + geom_histogram() +                # overall distribution of the user ratings
  labs(x="Ratings", y="Count",title="Distribution of the user ratings") +
  theme(plot.title = element_text(hjust = 0.5))
```
### The above histogram of ratings distribution is left skewed, with the mode = 4.


```{r}
mx_film_genre <- as.data.frame(MovieLenseMeta) 
rownames(mx_film_genre) <- mx_film_genre$title
mx_film_genre <- as.matrix(mx_film_genre[,5:22])   # film x genre 0/1 indicator matrix (metadata columns 5:22)

# NOTE: this overwrites the NAs in the GLOBAL mx_user_film with 0 for the matrix product below
mx_user_film[is.na(mx_user_film)] <- 0

# user x genre: summed ratings per user and genre
# assumes the row order of MovieLenseMeta matches the column order of MovieLense - TODO confirm
mx_user_genre <- mx_user_film %*% mx_film_genre

mx_genre_user <- as.data.frame(t(mx_user_genre))    # genre x user
mx_genre_user$summe <- rowSums(mx_genre_user)               # new column "summe": summed user ratings per genre
mx_genre_user <- cbind(genre = rownames(mx_genre_user), mx_genre_user)# new column "genre": genre name copied from the row names
ggplot(mx_genre_user,aes(summe,genre)) + geom_col() + labs(x= "summed ratings of all users", y="Genre",title="Distribution of the user ratings by genre combination") + 
  theme(plot.title = element_text(hjust = 0.5))
mx_genre_user <- mx_genre_user %>% select(-genre)
```
### Above is the distribution of ratings by genre, "drama" has the highest summed ratings.

```{r}
# NOTE(review): rowSums over the 0/1 genre indicators counts FILMS per genre,
# not views - the axis label "Number of Views" in the plot below may be misleading
df_22 <- as.data.frame(t(mx_film_genre)) %>% mutate(cnt = rowSums(as.data.frame(t(mx_film_genre))))%>% arrange(desc(cnt))  # add new column: count per genre
df_22 <- cbind(genres = rownames(df_22),df_22)               # genre names as a regular column
rownames(df_22) <- 1:nrow(df_22)                            # plain integer row index

ggplot(df_22,aes(x = (reorder(genres,cnt)), y = cnt)) + geom_col() + coord_flip() +
  labs(y="Number of Views", x="Genres",title="Distribution by genres") + 
  theme(plot.title = element_text(hjust = 0.5))
```
### The plot above shows the distribution of views by genre. The genre drama has the highest number of views.

2.3 Wie verteilen sich die mittleren Kundenratings pro Film?

```{r}
df_avg_rating_film <- df_film_user %>% mutate(avg_rating = rowMeans(df_film_user,na.rm = TRUE, dims = 1)) %>% select('avg_rating') # mean rating per film
ggplot(df_avg_rating_film,aes(avg_rating)) + geom_histogram(binwidth = 1) +                # distribution of the mean ratings per film
  labs(x="Mean user-ratings per film", y="Count of films",title="Distribution of the mean user-ratings per film") + 
  theme(plot.title = element_text(hjust = 0.5))

```
### The plot shows that the mode of the average rating per film is 3. Most movies have an average rating greater than 2.


2.4 Wie stark streuen die Ratings von individuellen Kunden?

```{r}
df_avg_rating_user <- df_user_film %>% mutate(avg_rating = rowMeans(df_user_film,na.rm = TRUE, dims = 1)) %>% select('avg_rating') # mean rating given by each user
ggplot(df_avg_rating_user,aes(avg_rating)) + geom_histogram(bins = 5) +                # distribution of the per-user mean ratings (coarse bins)
  labs(x="Mean user-ratings per user", y="Count of users",title="Distribution of the mean user-ratings per user")+ 
  theme(plot.title = element_text(hjust = 0.5))
```
### The plot shows the mode of average user ratings per user is 3. Most of the average ratings are 3 or 4.


2.5 Welchen Einfluss hat die Normierung der Ratings pro Kunde auf deren Verteilung?
```{r}
df_avg_rating_user <- df_user_film %>% mutate(avg_rating = rowMeans(df_user_film,na.rm = TRUE, dims = 1)) %>% select('avg_rating') # mean rating per user, before normalization
ggplot(df_avg_rating_user,aes(avg_rating)) + geom_histogram(bins = 20) +                # finer-grained distribution for comparison with the normalized plot
  labs(x="Mean ratings per user", y="Count of users",title="Distribution of the mean ratings per user")+ 
  theme(plot.title = element_text(hjust = 0.5))
```

```{r}
normalized_movielens <- as(normalize(MovieLense,method = "z-score"), "matrix") # z-score normalize each user's ratings
normalized_movielens <- as.data.frame(normalized_movielens)
normalized_avg_rating_user <- normalized_movielens %>% mutate(avg_rating=rowMeans(normalized_movielens,na.rm=TRUE, dims=1)) %>% select('avg_rating')  # mean normalized rating per user

ggplot(normalized_avg_rating_user,aes(avg_rating)) + geom_histogram(bins = 20) +                # distribution after normalization
  labs(x="Mean normalized ratings per user", y="Count of users",title="Distribution of the mean normalized ratings per user") + 
  theme(plot.title = element_text(hjust = 0.5))
```
### without normalization (first plot): the average ratings are slightly left skewed, with the mode of 3.8. The ratings vary a lot across different users.

### with Z-score normalization (second plot): the average ratings per user are all around 0. This mean, the ratings from different users are normalized to the same scale, with the mean of 0, the standard deviation of 1. This will reduce the influence of rating habits of different users. 

2.6 Welche strukturellen Charakteristika (z.B. Sparsity) und Auffälligkeiten zeigt die User-Item Matrix?

### Recommender System data usually contains a large numbers of users(rows) and items (columns), but a single user interacts with only a small subset of the items. This means, the dataframe consists of many zero values, the structure is extremely sparse.

### The User-Item Matrix of MovieLense dataset is dgCMatrix, which is a class of sparse numeric matrices in the compressed, sparse, column-oriented format. In this implementation the non-zero elements in the columns are sorted into increasing row order.


3 Datenreduktion

3.1 Reduziere den MovieLense Datensatz auf rund 400 Kunden und 700 Filme, indem du Filme und Kunden mit sehr wenigen Ratings entfernst.

```{r}
# Reduce to the 700 most-rated films, then to the 400 most-active users.
# (The original used slice(0:700)/slice(0:400), which only works because the
# 0 index is silently dropped; 1:700/1:400 states what is meant.)
df_reduced <- df_film_user %>% mutate(n_pro_film = rowSums(!is.na(df_film_user))) %>% arrange(desc(n_pro_film)) %>% slice(1:700) %>% select(-n_pro_film)       # keep the 700 films with the most ratings
df_reduced <- as.data.frame(t(df_reduced)) 
df_reduced <- df_reduced %>% mutate(n_pro_user = rowSums(!is.na(df_reduced))) %>% arrange(desc(n_pro_user)) %>% slice(1:400) %>% select(-n_pro_user)  # keep the 400 users with the most ratings
df_reduced   # 400 users x 700 films
```

3.2 Untersuche und dokumentiere die Eigenschaften des reduzierten Datensatzes und beschreibe den Effekt der Datenreduktion: Anzahl Filme und Kunden sowie Sparsity vor und nach Datenreduktion

### Vor Datenreduktion: 1664 Movies, 943 Users, rund 93.7% der Einträge sind NA.
```{r}
print(dim(df_user_film))
# Sparsity = share of NA entries over ALL cells. BUG FIX: the original divided
# by the hard-coded 1663*942, which is off by one in both dimensions
# (the matrix is 943 users x 1664 films); use the actual matrix size instead.
print(sum(is.na(df_user_film))/prod(dim(df_user_film)))
```
```{r}
image(as(df_user_film,"matrix"), main = "sparsity of dataframe before reduction")
```
### Nach Datenreduktion: 700 Movies, 400 users, 75.90% data are NA.

```{r}
print(dim(df_reduced))
# sparsity of the reduced matrix; use the actual dimensions instead of the
# hard-coded 700*400 so the check stays correct if the reduction changes
print(sum(is.na(df_reduced))/prod(dim(df_reduced)))
```

```{r}
image(as(df_reduced,"matrix"),main = "sparsity of dataframe after reduction")
```
### The two images above show the data sparsity before (first image) and after (second image) reduction. The blank pixels represent NA values; the colored pixels represent available ratings. Comparing the two images, the first has more blank and fewer colored pixels than the second. This means the data after reduction is less sparse than before: the data reduction successfully reduced the sparsity.



3.3 mittlere Kundenratings pro Film vor und nach Datenreduktion.

### Before data reduction

```{r}
df_avg_rating <- df_film_user %>% mutate(avg_rating = rowMeans(df_film_user,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
ggplot(df_avg_rating,aes(avg_rating)) + geom_histogram(binwidth = 0.25) +                # die Verteilung
  labs(x="Mean ratings per film", y="Count",title="Before reduction: Distribution of the mean ratings by film") + 
  theme(plot.title = element_text(hjust = 0.5))
```

```{r}
df_reduced_t <- as.data.frame(t(df_reduced))
df_reduced_avg_rating <- df_reduced_t%>% mutate(avg_rating = rowMeans(df_reduced_t,na.rm = TRUE, dims = 1)) %>% select('avg_rating')
ggplot(df_reduced_avg_rating,aes(avg_rating)) + geom_histogram(binwidth = 0.25) +                # die Verteilung
  labs(x="Mean ratings per film", y="Count",title="After reduction: Distribution of the mean ratings by film") + 
  theme(plot.title = element_text(hjust = 0.5))
```
### After the data reduction, the average ratings are close to a left skewed normal distribution.




4 Analyse Ähnlichkeitsmatrix

4.1 Zerlege den reduzierten MovieLense Datensatz in ein disjunkte Trainings-und Testdatenset im Verhältnis 4:1

```{r}
set.seed(465)
mx_reduced <- as.matrix(df_reduced)
rrm_reduced <- as(mx_reduced,"realRatingMatrix")
# 80/20 split; goodRating=4 marks ratings >= 4 as relevant for the evaluation
train_test <- evaluationScheme(rrm_reduced, method="split", train=0.8, k=1, given=20, goodRating=4)

# training data: 80% of the users
rrm_reduced_train <- getData(train_test,"train")


# test data: the remaining 20% of the users, split into a known and an unknown part

# the known portion holds the given=20 items per test user that are fed to the model for prediction
rrm_reduced_known <- getData(train_test,"known")


# the unknown portion is held back to compute the prediction error of the model
rrm_reduced_unknown <- getData(train_test,"unknown")


```

4.2 Trainiere ein IBCF Modell mit 30 Nachbarn und Cosine Similarity

```{r}
# IBCF: cosine similarity, k = 30 neighbours, z-score normalized ratings
model_IBCF <- Recommender(data = rrm_reduced_train,method="IBCF",parameter=list(normalize = "Z-score",method="Cosine",k=30)) 

```

4.3 Bestimme die Verteilung der Filme, welche bei IBCF für paarweise Ähnlichkeitsvergleiche verwendet werden.
Determine the distribution of films used in IBCF for pairwise similarity comparisons

```{r}
# display only the first 50 rows and columns of the learned item-item similarity matrix
get_model_IBCF <- getModel(model_IBCF) 
image(get_model_IBCF$sim[1:50, 1:50], main = "Similarity of the first 50 rows and columns")

```

### The similarity matrix is not symmetric. Each row has 30 elements larger than 0. In each column, the number of elements greater than 0 indicates how many times this film was included in the TOP list of other films.

```{r}
# per film: in how many other films' similarity neighbourhoods it appears
IBCF_sim <- as.data.frame(colSums(get_model_IBCF$sim > 0)) 
colnames(IBCF_sim) <- "recommended_frequency" # frequency that the corresponding film is included in other films' TOP-N lists

# IDIOM FIX: refer to the column by bare name inside aes(); using
# IBCF_sim$recommended_frequency bypasses ggplot2's data masking and
# breaks under faceting/grouping. The plot itself is unchanged.
ggplot(IBCF_sim, aes(x = recommended_frequency)) + geom_histogram(fill="black", col="grey",binwidth = 5)+
  labs(x = "Recommended frequency", y = "Count", title = "Distribution of recommended frequency per film") +
  theme(plot.title = element_text(hjust = 0.5))

```
### The plot displays the distribution of films by how many times the corresponding film included in the TOP list of other films. For instance, about 52 films are not included in any TOP list of other films, about 100 films are included in the TOP lists of 5 films. The highest frequency is about 160.



4.4 Bestimme die Filme, die am häufigsten in der Cosine-Ähnlichkeitsmatrix auftauchen und analysiere deren Vorkommen und Ratings im reduzierten Datensatz. 

### die am häufigsten Film in der Cosine-Ähnlichkeitsmatrix 
```{r}
# the 10 films that occur most often in the cosine similarity matrix,
# i.e. the films most frequently included in other films' neighbourhoods
# NOTE(review): slice(0:10) works only because the 0 index is dropped; slice(1:10) would be clearer
high_freq_film <- IBCF_sim %>% mutate(film = rownames(IBCF_sim)) %>% arrange(desc(recommended_frequency)) %>% slice(0:10)  %>% select(film,recommended_frequency)
high_freq_film 
```
### Mouse Hunt is the most frequently recommended film.



### die Vorkommen und Ratings in reduzierten Datensatz
```{r}
# occurrence: share of users that rated each film (non-NA count / number of users)
# NOTE(review): `t` shadows the name of base::t here; function calls t(...) still resolve correctly
t <- df_reduced_t %>% mutate(is_NA = rowSums(is.na(df_reduced_t)),not_NA = rowSums(!is.na(df_reduced_t)),occurrence = rowSums(!is.na(df_reduced_t))/dim(df_reduced_t)[2], film = rownames(df_reduced_t)) %>% select(is_NA,not_NA,occurrence,film)
Occurrence <- left_join(high_freq_film,t,by = "film") %>% select(film,recommended_frequency,occurrence) 
t2 <- df_reduced_t %>% mutate(film = rownames(df_reduced_t), avg_rating = rowMeans(df_reduced_t,na.rm=TRUE))%>% select(film,avg_rating) # mean rating per film
Occurrence <- left_join(Occurrence,t2,by = "film")
Occurrence  # occurrence: not_NA / user count, i.e. the share of users that rated this film
```
### recommended_frequency: the frequanecy that this item appears in the top-n recommendation list of other users. 
### occurrence: ratio that the film is rated to all users
### avg_rating: the average rating of each film
### From the result, we could see that, there are no direct relationship between the three variables. The most often recommended film Mouse Hunt has a relatively low average rating 2.32, and medium occurrence.



5 Analyse Top-N Listen IBCF vs UBCF Vergleiche und diskutiere Top N Empfehlungen von IBCF und UBCF Modellen mit 30 Nachbarn und Cosine Similarity für den reduzierten Datensatz.
Analysis Top-N lists IBCF vs UBCF. Compare and discuss top N recommendations from IBCF and UBCF models with 30 neighbors and cosine similarity for the reduced data set.

5.1 Berechne Top 15 Empfehlungen für Testkunden mit IBCF und UBCF

```{r}
## Top-15 recommendations for the test users with IBCF
Pred_IBCF <- predict(object = model_IBCF, newdata = rrm_reduced_known, n = 15,type=c("topNList"))

# map the recommended item indices back to film titles (15 films x test users)
TOP15_IBCF <- sapply(Pred_IBCF@items, function(x) {colnames(df_reduced)[x]})
TOP15_IBCF[,1:2]  # display only the first two test users
```


```{r}
# predict with UBCF
model_UBCF <- Recommender(rrm_reduced_train,method="UBCF",param=list(normalize = "Z-score",method="Cosine",nn=30)) # UBCF: cosine, 30 neighbours, z-score
# Top-15 recommendations for the test users with UBCF
Pred_UBCF <- predict(object = model_UBCF, newdata = rrm_reduced_known, n = 15,type=c("topNList")) 
#TOP15_UBCF <- as(Pred_UBCF, "list")
TOP15_UBCF <- sapply(Pred_UBCF@items, function(x) {colnames(df_reduced)[x]}) # item indices -> film titles
TOP15_UBCF[,1:2] # display only the first two test users
```

5.2 Vergleiche die Top 15 Empfehlungen und deren Verteilung und diskutiere Gemeinsamkeiten und Unterschiede zwischen IBCF und UBCF für alle Testkunden.

### From above the result for first two users, we could see that the top 15 recommendations for the same user between the IBCF and UBCF models are completely different.


Compare the top 15 recommendations for all test users 
First identify the most recommended movies in the TOP-15 list from all test users. 
```{r}
# generate frequency tables : all the recommendation films with the corresponding frequencies 

# Count how often each film shows up across all users' top-15 lists
count_films <- function(top_matrix, label) {
  freq <- as.data.frame(table(as.factor(top_matrix)))
  colnames(freq) <- c(label, "Frequency")
  freq
}

film_freq_IBCF <- count_films(TOP15_IBCF, "Film_by_IBCF")
film_freq_UBCF <- count_films(TOP15_UBCF, "Film_by_UBCF")

# The 15 most frequently recommended films per model
film_freq_IBCF %>% arrange(desc(Frequency)) %>% head(15)
film_freq_UBCF %>% arrange(desc(Frequency)) %>% head(15)
```

```{r}
# Bar chart of the 15 most frequently recommended films (IBCF)
top15_ibcf_freq <- film_freq_IBCF %>% arrange(desc(Frequency)) %>% head(15)
ggplot(top15_ibcf_freq, aes(x = reorder(Film_by_IBCF, Frequency), y = Frequency)) +
  geom_col() +
  coord_flip() +
  labs(y = "Frequency", x = "Film",
       title = "Distribution of the Top-15 films for all the users with IBCF") +
  theme(plot.title = element_text(hjust = 0.5))
```

```{r}
# Bar chart of the 15 most frequently recommended films (UBCF)
top15_ubcf_freq <- film_freq_UBCF %>% arrange(desc(Frequency)) %>% head(15)
ggplot(top15_ubcf_freq, aes(x = reorder(Film_by_UBCF, Frequency), y = Frequency)) +
  geom_col() +
  coord_flip() +
  labs(y = "Frequency", x = "Film",
       title = "Distribution of the Top-15 films for all the users with UBCF") +
  theme(plot.title = element_text(hjust = 0.5))
```

### The maximum recommended frequencies with IBCF and UBCF are 26 and 28 respectively, with minima of 15 and 13. However, compared to IBCF, the distribution for UBCF has a longer tail. This means that with the UBCF model, some movies are recommended much more often than others.


6 Analyse Top-N Listen Ratings
Untersuche den Einfluss von Ratings (ordinale vs binäre Ratings) und Modelltyp (IBCF vs UBCF) auf Top N Empfehlungen für den reduzierten Datensatz.

6.1 Vergleiche den Anteil übereinstimmender Empfehlungen der Top 15 Liste für IBCF vs UBCF, beide mit ordinalem Rating und Cosine Similarity für alle Testkunden

```{r}
# Ratings of the "unknown" part of the test users (ground truth for evaluation)
mx_reduced_unknown <- as(rrm_reduced_unknown, "matrix")

# Predict the test-user ratings ONCE per model and reuse the prediction
# object (the original code called predict() twice per model with
# identical arguments, doubling the work for no benefit).
pred_IBCF_ratings <- predict(object = model_IBCF, newdata = rrm_reduced_known, type = "ratings")
pred_UBCF_ratings <- predict(object = model_UBCF, newdata = rrm_reduced_known, type = "ratings")

pred_rating_IBCF <- as(pred_IBCF_ratings, "matrix")
pred_rating_UBCF <- as(pred_UBCF_ratings, "matrix")

# Prediction error (RMSE/MSE/MAE) on the held-out "unknown" ratings
acc_IB <- calcPredictionAccuracy(pred_IBCF_ratings, rrm_reduced_unknown)
acc_UB <- calcPredictionAccuracy(pred_UBCF_ratings, rrm_reduced_unknown)
acc_ordinal <- rbind(acc_IB, acc_UB)
rownames(acc_ordinal) <- c("IBCF ordinal", "UBCF ordinal")
acc_ordinal
```
### the UBCF model with ordinal ratings and cosine similarity has better accuracy.


6.2 Vergleiche den Anteil übereinstimmender Empfehlungen der Top 15 Liste für IBCF vs UBCF, beide mit binärem Rating und Jaccard Similarity für alle Testkunden

```{r}
# Binarize the reduced data set: ratings > 3 become 1 ("liked"),
# ratings <= 3 become 0; NAs stay NA at this point.
df_reduced_bi <- df_reduced
df_reduced_bi[df_reduced_bi <= 3] <- 0
df_reduced_bi[df_reduced_bi > 3] <- 1

set.seed(468)
mx_reduced_bi <- as.matrix(df_reduced_bi)
rrm_reduced_bi <- as(mx_reduced_bi, "realRatingMatrix")
# 80/20 train/test split; each test user keeps 20 "known" ratings
train_test_bi <- evaluationScheme(rrm_reduced_bi, method = "split", train = 0.8, k = 1, given = 20)

# training data: 80% of the users
rrm_reduced_train_bi <- getData(train_test_bi, "train")

# test data (20% of the users) is split into two parts:
# "known"   — 20 ratings per test user, used as model input
rrm_reduced_known_bi <- getData(train_test_bi, "known")
# "unknown" — the remaining ratings, used to compute the prediction error
rrm_reduced_unknown_bi <- getData(train_test_bi, "unknown")

# IBCF / UBCF models with Jaccard similarity on the binary data
model_IBCF_bi <- Recommender(data = rrm_reduced_train_bi, method = "IBCF", parameter = list(normalize = "Z-score", method = "Jaccard", k = 30))
model_UBCF_bi <- Recommender(data = rrm_reduced_train_bi, method = "UBCF", parameter = list(normalize = "Z-score", method = "Jaccard", nn = 30))

# Predict ratings once per model and reuse the prediction object
# (the original called predict() twice per model).
pred_IBCF_bi <- predict(object = model_IBCF_bi, newdata = rrm_reduced_known_bi, type = "ratings")
pred_UBCF_bi <- predict(object = model_UBCF_bi, newdata = rrm_reduced_known_bi, type = "ratings")
pred_rating_IBCF_bi <- as(pred_IBCF_bi, "matrix")
pred_rating_UBCF_bi <- as(pred_UBCF_bi, "matrix")

# the test users' "unknown" ratings
mx_reduced_unknown_bi <- as(rrm_reduced_unknown_bi, "matrix")

# evaluate the predictions on the "unknown" ratings
acc_IB_bi <- calcPredictionAccuracy(pred_IBCF_bi, rrm_reduced_unknown_bi)
acc_UB_bi <- calcPredictionAccuracy(pred_UBCF_bi, rrm_reduced_unknown_bi)
acc_bi <- rbind(acc_IB_bi, acc_UB_bi)
rownames(acc_bi) <- c("IBCF binary", "UBCF binary")
acc_bi
```
### the UBCF model with binary ratings and Jaccard similarity has better accuracy.


6.3 Vergleiche den Anteil übereinstimmender Empfehlungen der Top 15 Liste für UBCF mit ordinalem (Cosine Similarity) vs binärem Rating (Jaccard Similarity) für alle Testkunden.

```{r}
rbind(acc_ordinal,acc_bi)
```
### The models with binary ratings show a large improvement in accuracy compared to the models with ordinal ratings.


7  Analyse Top-N Listen -IBCF vs SVD
Aufgabe: Vergleiche Memory-based IBCF und Modell-based SVD Recommenders bezüglich Überschneidung ihrer Top-N Empfehlungen
für die User-Item Matrix des reduzierten Datensatzes (Basis: IBCF mit 30 Nachbarn und Cosine Similarity).

1. Vergleiche wie sich der Anteil übereinstimmender Empfehlungen der Top-15 Liste für IBCF vs verschiedene SVD Modelle verändert, wenn die Anzahl der Singulärwerte für SVD von 10 auf 20, 30, 40, 50 verändert wird.

```{r}

# SVD models: sweep the number of singular values k = 10..50 and evaluate
# each model's top-15 recommendations on the held-out ratings
# (goodRating = 4). The original repeated the same model/accuracy code
# five times; a loop removes the duplication with identical results.
svd_ks <- c(10, 20, 30, 40, 50)
acc_SVD_list <- lapply(svd_ks, function(k) {
  model <- Recommender(data = rrm_reduced_train, method = "SVD",
                       parameter = list(normalize = "Z-score", k = k))
  calcPredictionAccuracy(
    predict(object = model, newdata = rrm_reduced_known, n = 15, type = "topNList"),
    rrm_reduced_unknown, given = 20, goodRating = 4
  )
})

# Reference model: IBCF with cosine similarity and 30 neighbours
acc_IBCF_top15 <- calcPredictionAccuracy(
  predict(object = model_IBCF, newdata = rrm_reduced_known, n = 15, type = "topNList"),
  rrm_reduced_unknown, given = 20, goodRating = 4
)

acc_SVD_IB <- rbind(do.call(rbind, acc_SVD_list), acc_IBCF_top15)
rownames(acc_SVD_IB) <- c(paste0("SVD_k_", svd_ks), "IBCF_cos_k_30")
acc_SVD_IB
```
### The SVD model with k = 10 singular values has the best precision and recall. 
### SVD k=10 > SVD k=20 > SVD k=40 > SVD k=30 > IBCF cosine k=30 > SVD k=50


8 Wahl des optimalen Recommenders Aufgabe: 
Bestimme aus 5 unterschiedlichen Modellen das hinsichtlich Top-N Empfehlungen beste Modell. 
Begründe deine Modellwahlen aufgrund der bisher gemachten Erkenntnisse und verwende als 6. Modell einen Top-Movie Recommender (Basis: reduzierter Datensatz).


8.1 Verwende für die Evaluierung 10-fache Kreuzvalidierung

```{r}
# 10-fold cross-validation scheme; each test user keeps 20 given ratings,
# and ratings >= 4 count as relevant.
set.seed(6954)
scheme <- evaluationScheme(rrm_reduced, method = "cross", k = 10, given = 20, goodRating = 4)

# Evaluate five recommender types on the same scheme
cv_IBCF <- evaluate(scheme, method = "IBCF", type = "topNList",
                    parameter = list(normalize = "Z-score", method = "cosine", k = 30), n = 15)
cv_UBCF <- evaluate(scheme, method = "UBCF", type = "topNList",
                    parameter = list(normalize = "Z-score", method = "cosine", nn = 30), n = 15)
cv_SVD <- evaluate(scheme, method = "SVD", type = "topNList",
                   parameter = list(normalize = "Z-score", k = 30), n = 15)
cv_RANDOM <- evaluate(scheme, method = "RANDOM", type = "topNList", n = 15)
cv_POP <- evaluate(scheme, method = "POPULAR", type = "topNList",
                   parameter = list(normalize = "Z-score"), n = 15)

# Average the confusion-matrix metrics over the 10 folds
Result_81 <- do.call(rbind, lapply(list(cv_IBCF, cv_UBCF, cv_SVD, cv_RANDOM, cv_POP), avg))
rownames(Result_81) <- c("IBCF", "UBCF", "SVD", "RANDOM", "POPULAR")
Result_81

```
### The model with popular method has the best precision and recall.
### Popular > IBCF ~ SVD > RANDOM > UBCF


8.2 Begründe deine Wahl der Performance Metrik,

### Higher precision means that an algorithm returns more relevant results than irrelevant ones, and high recall means that an algorithm returns most of the relevant results (whether or not irrelevant ones are also returned).

### A perfect precision score of 1.0 means that every result retrieved was relevant (but says nothing about whether all relevant documents were retrieved) whereas a perfect recall score of 1.0 means that all relevant documents were retrieved by the search (but says nothing about how many irrelevant documents were also retrieved)

### The model Popular returns the highest score of both precision and recall. 
### Popular > SVD ~ IBCF > RANDOM > UBCF


8.3 Analysiere das beste Modell für Top-N Recommendations mit N gleich 10, 15, 20, 25 und 30,

```{r}
# Evaluate the POPULAR recommender for several list lengths N
POP_results <- evaluate(scheme, method = "POPULAR", type = "topNList",
                        parameter = list(normalize = "Z-score"),
                        n = c(10, 15, 20, 25, 30))
# Fold-averaged metrics, one row per N
avg_POP_results <- avg(POP_results)
avg_POP_results
```
### When I increase the N, the "recall" is getting better (larger value), but the "precision" is getting worse (smaller value).


8.4 Optimiere dein bestes Modell hinsichtlich Hyperparameter.
Hinweis: Verwende für den Top-Movie Recommender die Filme mit den höchsten Durchschnittsratings.

```{r}
# Keep only films whose average rating exceeds 3 (ignoring NAs),
# sorted by average rating in descending order.
df_film_user_red <- as.data.frame(t(df_reduced))
df_top_avg <- df_film_user_red %>%
  mutate(avg_rating = rowMeans(df_film_user_red, na.rm = TRUE)) %>%
  arrange(desc(avg_rating)) %>%
  filter(avg_rating > 3) %>%
  select(-avg_rating)
rrm_top_avg <- as(t(df_top_avg), "realRatingMatrix")


set.seed(846954)
scheme_top_avg <- evaluationScheme(rrm_top_avg, method = "cross", k = 10, given = 20, goodRating = 4)

# POPULAR has a single tunable parameter (normalize):
# compare z-score vs centering for several list lengths N.
POP_top_avg_z <- avg(evaluate(scheme_top_avg, method = "POPULAR", type = "topNList",
                              parameter = list(normalize = "Z-score"), n = c(10, 15, 20, 25, 30)))

POP_top_avg_center <- avg(evaluate(scheme_top_avg, method = "POPULAR", type = "topNList",
                                   parameter = list(normalize = "center"), n = c(10, 15, 20, 25, 30)))

# Metric differences (precision/recall columns 6:7), plus a column of POP_top_avg_z
diff_z_center <- cbind((POP_top_avg_z - POP_top_avg_center)[, 6:7], POP_top_avg_z[, 10])
POP_top_avg_z; POP_top_avg_center; diff_z_center
```
### Here I tried to optimize the POPULAR model via its normalization parameter. The two normalization methods, z-score and centering, perform very similarly in terms of precision and recall. The models with z-score normalization perform slightly better for n = 10, 15, 25, 30, while the model with centering performs a bit better for n = 20. 


9 Implementierung Ähnlichkeitsmatrix

Aufgabe DIY: Implementiere eine Funktion zur effizienten Berechnung von sparsen Ähnlichkeitsmatrizen für IBCF RS und analysiere die Resultate für 100 zufällig gewählte Filme.

9.1 Implementiere eine Funktion, um für ordinale Ratings effizient die Cosine Similarity zu berechnen,

```{r}
cos_similarity <- function(mx) {
  # Item-item cosine similarity for an ordinal rating matrix
  # (users in rows, items in columns). Missing ratings are treated as 0.
  #
  # The original double loop ran O(n^2) R-level iterations; crossprod()
  # computes all pairwise dot products in one vectorized call with
  # numerically identical results (columns with zero norm still yield
  # NaN via 0/0, as before).
  m <- as.matrix(mx)
  m[is.na(m)] <- 0
  dot <- crossprod(m)                  # dot[i, j] = t(m[, i]) %*% m[, j]
  norms <- sqrt(diag(dot))             # Euclidean norm of each column
  sim_mx <- dot / outer(norms, norms)  # cosine = dot / (|i| * |j|)
  dimnames(sim_mx) <- NULL             # the original returned an unnamed matrix
  sim_mx
}

# Item-item cosine similarity matrix of the reduced data set
cos_sim_reduced_1 <- cos_similarity(df_reduced)

```


9.2 Implementiere eine Funktion, um für binäre Ratings effizient die Jaccard Similarity zu berechnen,


```{r}
Jacc_similarity <- function(mx) {
  # Item-item Jaccard similarity for a rating matrix (users in rows,
  # items in columns). Ratings are binarized first: a rating > 3 counts
  # as a "like" (1); ratings <= 3 and missing ratings count as 0.
  #
  # Bug fix: the original computed 1 - hamming_distance / ncol(mx), which
  # is not the Jaccard index — and it even divided by the number of items
  # instead of the vector length. The Jaccard index is
  #   |A intersect B| / |A union B|
  # over the sets of liked items.
  b <- as.matrix(mx)
  b[is.na(b)] <- 0
  b <- (b > 3) * 1                          # binarize: likes -> 1, else 0

  inter <- crossprod(b)                     # pairwise intersection sizes
  sizes <- diag(inter)                      # number of likes per item
  uni <- outer(sizes, sizes, `+`) - inter   # |A| + |B| - |A intersect B|
  sim_mx <- inter / uni
  sim_mx[uni == 0] <- 1  # two empty like-sets: define similarity as 1
  dimnames(sim_mx) <- NULL
  sim_mx
}

# Item-item Jaccard similarity matrix of the reduced (binarized) data set
Jacc_sim_reduced <- Jacc_similarity(df_reduced)  # film x film similarity matrix (author notes 700 x 700)
```



9.3 Vergleiche deine Implementierung der Cosine-basierten Ähnlichkeitsmatrix für ordinale Kundenratings mit der korrespondierenden via Open Source Paketen erzeugten Ähnlichkeitsmatrix,

```{r}
# Cross-check our cosine implementation against recommenderlab.
mx_reduced_0 <- mx_reduced
mx_reduced_0[is.na(mx_reduced_0)] <- 0  # zero-fill NAs, as cos_similarity() does
cos_sim_reduced_2 <- as.matrix(
  similarity(as(mx_reduced_0, "realRatingMatrix"), method = "cosine", which = "items")
)

# recommenderlab leaves the diagonal at 0, whereas our implementation
# yields 1 (an item is fully similar to itself). Align the diagonals
# before comparing the two matrices.
diag(cos_sim_reduced_2) <- 1

compare_two_cos_sim_methods <- all.equal(cos_sim_reduced_1, cos_sim_reduced_2,
                                         tolerance = 1e-10, check.attributes = FALSE)
compare_two_cos_sim_methods
```
### The cosine similarity matrices by two different methods are equal (with tolerance of 1e-10).



9.4 Vergleiche und diskutiere die Unterschiede deiner mittels Cosine Similarity erzeugten Ähnlichkeitsmatrizen für ordinale und normierte Kundenratings mit der Jaccard-basierten Ähnlichkeitsmatrix.

```{r}
# Compare the (ordinal) cosine similarity matrix against the Jaccard matrix;
# all.equal() reports the mean relative difference when they are not equal.
compare_cos_Jacc <- all.equal(cos_sim_reduced_1,Jacc_sim_reduced,tolerance = 1e-3,check.attributes = FALSE)
compare_cos_Jacc
```
### The mean relative difference between cosine similarity and jaccard similarity is 2.48.
### Jaccard similarity takes only the unique set of items. The cosine similarity takes the total length of the vectors.



10 Implementierung Top-N Metriken

Aufgabe DIY: Implementiere Funktionen für die Beurteilung der Top-N Metriken Precision und Recall sowie für alle Kunden der Item-space Coverage und Novelty und teste diese mit IBCF Recommendations (Basis: reduzierter Datensatz; N = 5, 10, 15, 20, 25, 30)


10.1 Implementiere eine Funktion, um aus Top-N Listen für alle Kunden die Item-space Coverage@N und Novelty@N eines Recommenders zu beurteilen und teste diese.


```{r}
calc_topn_metrics <- function(mx, split_ratio, N) {
  # Evaluate an IBCF recommender on a user x item rating matrix.
  #
  # mx          : user x item rating matrix (NA = not rated)
  # split_ratio : share of users used for training
  # N           : length of the top-N recommendation lists
  #
  # Returns a list with:
  #   accuracy : precision/recall metrics on the held-out ("unknown") ratings
  #   coverage : share of rated items that appear in any test user's top-N list
  #   novelty  : per user (ALL users), the share of the top-N list not yet rated
  rrm <- as(mx, "realRatingMatrix")

  # Split users into train and test; each test user keeps 20 "known"
  # ratings for prediction, the rest ("unknown") is used for evaluation.
  train_test <- evaluationScheme(rrm, method = "split", train = split_ratio, k = 1, given = 20, goodRating = 4)
  rrm_train <- getData(train_test, "train")
  rrm_known <- getData(train_test, "known")
  rrm_unknown <- getData(train_test, "unknown")

  # IBCF model: z-score normalisation, cosine similarity, 10 neighbours
  model_IBCF <- Recommender(data = rrm_train, method = "IBCF", parameter = list(normalize = "Z-score", method = "Cosine", k = 10))
  # top-N recommendation lists for the test users
  pred_IBCF <- predict(object = model_IBCF, newdata = rrm_known, n = N, type = "topNList")

  ### accuracy ----------------------------------------------------------
  # Reuse pred_IBCF instead of predicting a second time (the original
  # called predict() twice with identical arguments).
  acc_IBCF <- calcPredictionAccuracy(pred_IBCF, rrm_unknown, given = 20, goodRating = 4)
  acc_IBCF <- t(as.data.frame(acc_IBCF))
  rownames(acc_IBCF) <- NULL

  ### item-space coverage -----------------------------------------------
  # Share of films (with at least one rating) that occur in any top-N list.
  TOP_N_list <- sapply(pred_IBCF@items, function(x) colnames(as(rrm_known, "matrix"))[x])
  # unique films recommended to the test users
  uniq_film_test <- reshape2::melt(as(TOP_N_list, "matrix")) %>%
    rename(UserID = Var2, rank = Var1, Film_name = value) %>%
    distinct(Film_name)

  # films that received at least one rating
  uniq_film_train <- as.data.frame(t(mx))
  uniq_film_train$cnt <- rowSums(!is.na(uniq_film_train))
  uniq_film_train <- uniq_film_train %>% filter(cnt > 0)

  coverage <- data.frame(coverage = nrow(uniq_film_test) / nrow(uniq_film_train))

  ### novelty -----------------------------------------------------------
  # For every user: share of the top-N list the user has NOT rated yet.
  pred_all <- predict(object = model_IBCF, newdata = rrm, n = N, type = "topNList")
  TOP_N_all <- sapply(pred_all@items, function(x) colnames(mx)[x])

  # Hoisted out of the loop: the original re-transposed mx once per user.
  film_user <- t(mx)
  novelty_values <- vapply(seq_len(nrow(mx)), function(i) {
    rated_films <- rownames(film_user)[!is.na(film_user[, i])]
    overlap <- sum(TOP_N_all[, i] %in% rated_films)  # already-rated items in the list
    1 - overlap / N
  }, numeric(1))
  novelty_table <- data.frame(UserID = rownames(mx), novelty = novelty_values)

  list(accuracy = acc_IBCF, coverage = coverage, novelty = novelty_table)
}


# Compute accuracy, item-space coverage and novelty for the reduced data
# set: 80/20 train/test split, top-20 recommendation lists.
test <- calc_topn_metrics(mx_reduced,0.8,20)

test$accuracy;test$coverage; test$novelty

```
11 Implementierung Top-N Monitor Aufgabe DIY: Untersuche die relative Übereinstimmung zwischen Top-N Empfehlungen und präferierten Filmen für 4 unterschiedliche Modelle (z.B. IBCF und UBCF mit unterschiedlichen Ähnlichkeits-metriken / Nachbarschaften sowie SVD mit unterschiedlicher Dimensionalitätsreduktion).

11.1 Fixiere 20 zufällig gewählte Testkunden für alle Modellvergleiche,
```{r}
set.seed(578)
# 95/5 split: the resulting 20 test users are fixed for all model comparisons
train_test_11 <- evaluationScheme(rrm_reduced, method = "split", train = 0.95, k = 1, given = 20, goodRating = 4)

# For each test user, 20 films are used for prediction ("known"),
# the remaining ratings for evaluation ("unknown").
rrm_reduced_train_11 <- getData(train_test_11, "train")
rrm_reduced_known_11 <- getData(train_test_11, "known")
rrm_reduced_unknown_11 <- getData(train_test_11, "unknown")

# Helper: top-15 accuracy of a model on the fixed test users. Removes the
# 10x copy-pasted calcPredictionAccuracy(predict(...)) calls of the original.
top15_acc <- function(model) {
  calcPredictionAccuracy(
    predict(object = model, newdata = rrm_reduced_known_11, n = 15, type = "topNList"),
    rrm_reduced_unknown_11, given = 20, goodRating = 4
  )
}

# IBCF models (cosine / Pearson similarity, 10 / 50 neighbours)
model_IBCF_cos_10 <- Recommender(data = rrm_reduced_train_11, method = "IBCF", parameter = list(normalize = "Z-score", method = "Cosine", k = 10))
model_IBCF_cos_50 <- Recommender(data = rrm_reduced_train_11, method = "IBCF", parameter = list(normalize = "Z-score", method = "Cosine", k = 50))
model_IBCF_ps_10 <- Recommender(data = rrm_reduced_train_11, method = "IBCF", parameter = list(normalize = "Z-score", method = "Pearson", k = 10))
model_IBCF_ps_50 <- Recommender(data = rrm_reduced_train_11, method = "IBCF", parameter = list(normalize = "Z-score", method = "Pearson", k = 50))

# UBCF models
model_UBCF_cos_10 <- Recommender(data = rrm_reduced_train_11, method = "UBCF", parameter = list(normalize = "Z-score", method = "Cosine", nn = 10))
model_UBCF_cos_50 <- Recommender(data = rrm_reduced_train_11, method = "UBCF", parameter = list(normalize = "Z-score", method = "Cosine", nn = 50))
model_UBCF_ps_10 <- Recommender(data = rrm_reduced_train_11, method = "UBCF", parameter = list(normalize = "Z-score", method = "Pearson", nn = 10))
model_UBCF_ps_50 <- Recommender(data = rrm_reduced_train_11, method = "UBCF", parameter = list(normalize = "Z-score", method = "Pearson", nn = 50))

# SVD models (model_SVD_10 is reused in section 11.2)
model_SVD_10 <- Recommender(data = rrm_reduced_train_11, method = "SVD", parameter = list(normalize = "Z-score", k = 10))
model_SVD_50 <- Recommender(data = rrm_reduced_train_11, method = "SVD", parameter = list(normalize = "Z-score", k = 50))

# Accuracy table for all ten models (row names match the original output)
acc_table <- rbind(
  acc_IBCF_cos_10 = top15_acc(model_IBCF_cos_10),
  acc_IBCF_cos_50 = top15_acc(model_IBCF_cos_50),
  acc_IBCF_ps_10 = top15_acc(model_IBCF_ps_10),
  acc_IBCF_ps_50 = top15_acc(model_IBCF_ps_50),
  acc_UBCF_cos_10 = top15_acc(model_UBCF_cos_10),
  acc_UBCF_cos_50 = top15_acc(model_UBCF_cos_50),
  acc_UBCF_ps_10 = top15_acc(model_UBCF_ps_10),
  acc_UBCF_ps_50 = top15_acc(model_UBCF_ps_50),
  acc_SVD_10 = top15_acc(model_SVD_10),
  acc_SVD_50 = top15_acc(model_SVD_50)
)

acc_table
```
### In the IBCF model, both precision and recall are better with cosine method and 50 neighbors.
### In the UBCF model, cosine method and 10 neighbors is a better combination.
### In the SVD model, precision and recall are better with singular value of 10. 
### Through all the models, The SVD model with singular value of 10 has the best precision (0.463) and recall (0.0779).



11.2 Bestimme den Anteil der Top-N Empfehlung nach Genres pro Kunde,

```{r}
Top_n_genre <- function(Top_n_list, n) {
  # Genre share of each user's top-n recommendation list.
  # Top_n_list: films x users character matrix of recommended titles
  # n         : list length (kept for interface compatibility)
  # Uses the global genre matrix mx_film_genre (films x genre flags).
  genre_lookup <- as.data.frame(mx_film_genre)
  genre_lookup$Film <- rownames(genre_lookup)

  ratio_rows <- lapply(seq_len(ncol(Top_n_list)), function(u) {
    recs <- data.frame(Film = Top_n_list[, u])
    genres <- left_join(recs, genre_lookup, by = "Film") %>% select(-Film)
    # NOTE(review): the first recommendation is dropped here, exactly as in
    # the original implementation — verify whether this is intended.
    genres <- genres[-1, ]
    # genre ratio = column sums of the 0/1 flags over the total flag count
    genres["ratio", ] <- colSums(genres) / sum(genres)
    genres["ratio", ]
  })

  table <- do.call(rbind, ratio_rows)
  rownames(table) <- colnames(Top_n_list)
  table
}


# Top-15 lists from the best model of section 11.1 (SVD, k = 10)
Pred_SVD_k10 <- predict(object = model_SVD_10, newdata = rrm_reduced_known_11, n = 15, type = "topNList")
Top15_SVD <- sapply(Pred_SVD_k10@items, function(idx) colnames(df_reduced)[idx])

# Genre shares of the top-15 recommendations per test user
Top_15_recommendations <- Top_n_genre(Top15_SVD, 15)

# Sanity check: each row of genre shares should sum to 1
# (the mean entry of summary() is dropped)
Top_15_recommendations; summary(rowSums(Top_15_recommendations))[-4]

```
### The table shows the percentage of genres in the top-15 recommendation list per user.
### The sum of each row are all 1, this indicates the total ratio of each user are all correct. 



11.3 Bestimme pro Kunde den Anteil nach Genres seiner Top-Filme (=Filme, welche vom Kunden die besten Bewertungen erhalten haben)

```{r}
Top_film_genre <- function(mx, n) {
  # Genre share of each user's n highest-rated films.
  # mx : user x item rating matrix (NA = not rated)
  # n  : number of top-rated films to consider per user
  # Uses the global genre matrix mx_film_genre (films x genre flags).
  #
  # Performance fix: the original rebuilt as.data.frame(t(mx)) and the
  # genre lookup table inside the loop, once per user; both are invariant
  # and are hoisted here. Results are unchanged.
  film_user <- as.data.frame(t(mx))
  film_user$Film <- rownames(film_user)

  genre_lookup <- as.data.frame(mx_film_genre)
  genre_lookup$Film <- rownames(genre_lookup)

  rows <- lapply(seq_len(nrow(mx)), function(i) {
    # the n best-rated films of user i
    user_films <- film_user[, c(i, ncol(film_user))]
    colnames(user_films) <- c("Ratings", "Film")
    user_films <- user_films %>%
      filter(complete.cases(.)) %>%
      arrange(desc(Ratings)) %>%
      slice(1:n)

    # attach the genre flags and compute the share of each genre
    joined <- left_join(user_films, genre_lookup, by = "Film")
    genres <- joined[, -(1:2)]  # drop Ratings and Film columns
    genres["ratio", ] <- colSums(genres) / sum(genres)
    genres["ratio", ]
  })

  table <- do.call(rbind, rows)
  rownames(table) <- rownames(mx)
  table
}

# Genre shares of the 15 best-rated films for every user in the reduced set
Top_15_films <- Top_film_genre(mx_reduced,15) # genres proportion of the top 15 films for every user
Top_15_films
```
```{r}
# Sanity check: each row of genre shares should sum to 1 (mean entry dropped)
summary(rowSums(Top_15_films))[-4]
```
### The first table shows the percentage of genres in the top-15 rated list per user.
### The sum of each row are all 1, this indicates the total ratio of each user are all correct. 



11.4 Vergleiche pro Kunde Top-Empfehlungen und Top-Filmen nach Genres, 

```{r}
# Restrict the top-rated-films table to the 20 test users that received
# top-15 recommendations.
Top_15_films_reduced <- Top_15_films
Top_15_films_reduced$UserID <- rownames(Top_15_films_reduced)
Top_15_films_reduced <- Top_15_films_reduced %>%
  filter(UserID %in% rownames(Top_15_recommendations)) %>%
  select(-UserID)

# Mean absolute error per user across the genre-share columns.
# Bug fix: the original divided rowSums() by the hard-coded 20 (the number
# of test users); a per-user mean must average over the genre columns,
# which is what rowMeans() does.
MAE_top_genre <- rowMeans(abs(Top_15_films_reduced - Top_15_recommendations))
"MAE between Top-recommendations and Top-films by genres per user:"; MAE_top_genre;"the five number statistics of the MAE:";summary(MAE_top_genre)[-4]
```


```{r}
"MSE between Top-recommendations and Top-films by genres per user:"
MSE_top_genre <- rowSums((Top_15_films_reduced - Top_15_recommendations)^2)/20
MSE_top_genre;"the five number statistics of the MSE:";summary(MSE_top_genre)[-4]
```


```{r}
"RMSE between Top-recommendations and Top-films by genres per user:"
RMSE_top_genre <- sqrt(rowSums((Top_15_films_reduced - Top_15_recommendations)^2)/20)
RMSE_top_genre;"the five number statistics of the RMSE:";summary(RMSE_top_genre)[-4]

```
### Three quantitative metrics — MAE (mean absolute error), MSE (mean squared error) and RMSE (root mean squared error) — were used to compare the difference between the top recommendations and the top-rated films by genre.



11.5 Definiere eine Qualitätsmetrik für Top-N Listen und teste sie. 

```{r}
# MAP: Average Precision per user and Mean Average Precision over users

MAP <- function(mx, Top_n_list, n) {
  # Mean Average Precision of top-n recommendation lists.
  # mx         : user x item rating matrix; ratings > 3 count as relevant
  # Top_n_list : films x users character matrix of recommended titles
  # n          : list length
  #
  # Bug fix: the original loop was `for (i in (ncol(mx_part) - 1))`,
  # which iterates over the single value ncol-1 — i.e. only ONE user —
  # instead of looping over all users.
  # Also fixed: tidyselect::all_of() was used inside base `[` subsetting
  # (it is only valid inside selecting functions); plain integer indices
  # are used here instead.

  # Keep only the users that occur in the top-n lists, as a film x user table.
  # NOTE(review): assumes mx rows and Top_n_list columns refer to the same
  # users in the same order, as the original did — confirm upstream.
  mx_part <- as.data.frame(mx) %>%
    filter(rownames(as.data.frame(mx)) %in% colnames(as.data.frame(Top_n_list)))
  mx_part <- as.data.frame(t(mx_part))
  mx_part$Film <- rownames(mx_part)

  Top_n_list <- as.data.frame(Top_n_list)
  Top_n_list$Rank <- 1:n  # rank of each recommendation within the list

  n_users <- ncol(mx_part) - 1  # last column is "Film"
  sum_avg_precision <- 0
  for (i in seq_len(n_users)) {
    # top-n list (with ranks) of user i
    top_i <- Top_n_list[, c(i, ncol(Top_n_list))]
    colnames(top_i) <- c("Film", "Rank")
    # ratings of user i with the film names
    ratings_i <- mx_part[, c(i, ncol(mx_part))]
    colnames(ratings_i) <- c("Rating", "Film")

    # join ratings onto the list; keep the relevant items (rating > 3)
    relevant <- left_join(top_i, ratings_i, by = "Film") %>% filter(Rating > 3)
    # precision at each relevant item: (index among relevant items) / (rank in list);
    # row_number() also handles the zero-relevant-items case safely
    # (the original `1:nrow(...)` would produce c(1, 0) on an empty join)
    relevant <- relevant %>% mutate(Numerator = row_number(), Precision = Numerator / Rank)

    # average precision of user i (NaN when the user has no relevant items)
    sum_avg_precision <- sum_avg_precision + mean(relevant$Precision)
  }
  # mean average precision over all users
  sum_avg_precision / n_users
}

# Mean Average Precision of the IBCF top-15 lists on the reduced data set
MAP(mx_reduced,TOP15_IBCF,15)
```
### First, for one user, find the rank r of the m-th relevant item (rating > 3) in the top_n_list and calculate its precision as m / r.
### secondly, calculate the precisions of all relevant items.
### the average precision of one user: average all the precision of relevant items.
### MAP: average precision of all users.